import os
import re
import sys
import argparse
from datetime import datetime

import pytz
from openpyxl import Workbook


def process_files(starting_id, output_filename="output.xlsx"):
    """Extract URLs and timestamps from every .txt file in the current
    directory into an Excel workbook, then delete the processed files.

    Each output row is (ID, URL, Date_UTC), where Date_UTC is the
    timestamp converted from Europe/Prague local time to UTC and
    formatted as ISO 8601 (``YYYY-MM-DDTHH:MM:SSZ``).

    Args:
        starting_id: Integer ID assigned to the first extracted row;
            subsequent rows increment from it.
        output_filename: Name of the Excel workbook to write.

    Side effects:
        Writes ``output_filename``; only if that save succeeds, deletes
        every .txt file that was processed without error.
    """
    txt_files = [f for f in os.listdir('.') if f.endswith('.txt')]
    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    # Timestamps look like "[<prefix> M D, YYYY at H:MM]".
    # NOTE(review): the first captured number is treated as the MONTH
    # below (US-style "month day, year") — confirm against the inputs.
    date_pattern = re.compile(
        r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
    url_pattern = re.compile(r'(https?://[^\s]+)')

    # Input timestamps are local Prague time; output must be UTC.
    local_tz = pytz.timezone("Europe/Prague")

    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])  # header row

    current_id = starting_id
    success_files = []  # files safe to delete once the workbook is saved

    for filename in txt_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = date_pattern.findall(content)
            urls = url_pattern.findall(content)

            # Pair dates with URLs positionally; zip() stops at the
            # shorter list, same as min(len(dates), len(urls)).
            for (month, day, year, time_str), url in zip(dates, urls):
                local_dt = datetime.strptime(
                    f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M")

                # localize() attaches the DST-aware Prague offset to the
                # naive datetime, then we convert to UTC.
                utc_dt = local_tz.localize(local_dt).astimezone(pytz.utc)
                formatted_date = utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")

                ws.append([current_id, url, formatted_date])
                current_id += 1

            # Queue for deletion; removal happens only after a save.
            success_files.append(filename)

        except Exception as e:
            # Fix: include the offending filename in the error message
            # (the original f-string had no placeholder).
            print(f"Error processing {filename}: {e}", file=sys.stderr)

    try:
        wb.save(output_filename)
        print(f"Successfully saved data to {output_filename}")
    except Exception as e:
        print(f"Failed to save {output_filename}. No text files were "
              f"deleted. Error: {e}", file=sys.stderr)
        return

    # Fix: cleanup is no longer inside the save try-block, so a failed
    # os.remove() can no longer print the misleading "Failed to save"
    # message after the workbook was in fact written.
    for filename in success_files:
        try:
            os.remove(filename)
            print(f"Deleted: {filename}")
        except OSError as e:
            print(f"Could not delete {filename}: {e}", file=sys.stderr)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates.")
    parser.add_argument("start_id", type=int,
                        help="Starting ID for the output")
    parser.add_argument("--output", type=str, default="extracted_data.xlsx",
                        help="Output Excel filename (default: extracted_data.xlsx)")
    args = parser.parse_args()

    process_files(args.start_id, args.output)