"""Extract dated URLs from .txt files into an Excel sheet with UTC timestamps.

Every ``.txt`` file in the current working directory is scanned for
bracketed date stamps and URLs. The n-th date is paired with the n-th URL,
the timestamp is converted from Europe/Prague local time to UTC, and one
row per pair is written to an Excel workbook. Source files that were
processed without error are deleted once the workbook has been saved.
"""

import argparse
import os
import re
import sys
from datetime import datetime

import pytz
from openpyxl import Workbook

# Matches e.g. "[... 3 15, 2024 at 14:30]" -> (month, day, year, "HH:MM").
DATE_PATTERN = re.compile(
    r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]'
)
URL_PATTERN = re.compile(r'(https?://[^\s]+)')

# Timestamps in the input are interpreted as Europe/Prague wall-clock time.
LOCAL_TZ = pytz.timezone("Europe/Prague")

ISO_UTC_FORMAT = "%Y-%m-%dT%H:%M:%SZ"


def _build_rows(dates, urls, first_id):
    """Return ``[id, url, iso_utc_date]`` rows for the paired dates and URLs.

    Args:
        dates: ``(month, day, year, "HH:MM")`` string tuples as produced by
            ``DATE_PATTERN.findall``.
        urls: URL strings as produced by ``URL_PATTERN.findall``.
        first_id: ID assigned to the first row; later rows count up from it.

    Returns:
        A list of rows. Pairing stops at the shorter of the two inputs.

    Raises:
        ValueError: If a matched date is not a valid calendar date or time.
    """
    rows = []
    for offset, ((month, day, year, time_str), url) in enumerate(zip(dates, urls)):
        local_dt = datetime.strptime(
            f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M"
        )
        # localize() attaches the Prague offset valid on that date (CET or
        # CEST) before the conversion to UTC.
        utc_dt = LOCAL_TZ.localize(local_dt).astimezone(pytz.utc)
        rows.append([first_id + offset, url, utc_dt.strftime(ISO_UTC_FORMAT)])
    return rows


def process_files(starting_id: int, output_filename: str = "output.xlsx") -> None:
    """Convert all .txt files in the current directory into one Excel sheet.

    Args:
        starting_id: ID given to the first extracted row; each following row
            gets the next integer.
        output_filename: Path of the ``.xlsx`` file to write.

    Side effects:
        Writes ``output_filename`` and, only if that save succeeds, deletes
        every ``.txt`` file that was processed without error. Progress goes
        to stdout, warnings and errors to stderr.
    """
    # Sorted so that ID assignment is reproducible; os.listdir order is
    # arbitrary.
    txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])  # Header row

    current_id = starting_id
    success_files = []

    for filename in txt_files:
        # Best-effort per file: a failure is reported and the file is kept.
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = DATE_PATTERN.findall(content)
            urls = URL_PATTERN.findall(content)
            if len(dates) != len(urls):
                print(
                    f"Warning: {filename} has {len(dates)} date(s) but "
                    f"{len(urls)} URL(s); unpaired entries are skipped.",
                    file=sys.stderr,
                )

            # Build every row first so that a bad entry cannot leave a
            # half-imported file in the sheet (it would be imported again,
            # as duplicates, on the next run because the file is kept).
            rows = _build_rows(dates, urls, current_id)
        except Exception as e:
            print(f"Error processing {filename}: {e}", file=sys.stderr)
            continue

        for row in rows:
            ws.append(row)
        current_id += len(rows)
        success_files.append(filename)  # Queue file for deletion

    try:
        wb.save(output_filename)
    except Exception as e:
        print(
            f"Failed to save {output_filename}. No text files were deleted. \nError: {e}",
            file=sys.stderr,
        )
        return

    print(f"Successfully saved data to {output_filename}")

    # Clean up only after a successful save. Each removal is handled on its
    # own so one failure neither stops the rest nor is reported as a save
    # failure.
    for filename in success_files:
        try:
            os.remove(filename)
        except OSError as e:
            print(f"Could not delete {filename}: {e}", file=sys.stderr)
        else:
            print(f"Deleted: {filename}")


def main() -> None:
    """Parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates."
    )
    parser.add_argument("start_id", type=int, help="Starting ID for the output")
    parser.add_argument(
        "--output",
        type=str,
        default="extracted_data.xlsx",
        help="Output Excel filename (default: extracted_data.xlsx)",
    )
    args = parser.parse_args()

    process_files(args.start_id, args.output)


if __name__ == "__main__":
    main()