Add txt file extraction

2026-03-21 18:54:28 +01:00
parent 770966e21f
commit 697fe2548c
1 changed files with 88 additions and 0 deletions
--- a/data/extract_to_excel.py
+++ b/data/extract_to_excel.py
@@ -0,0 +1,88 @@
+import os
+import re
+import sys
+import argparse
+from datetime import datetime
+import pytz
+from openpyxl import Workbook
+
+
+def _to_utc_iso(date_parts, local_tz):
+    """Convert one regex date match to an ISO 8601 UTC string.
+
+    Args:
+        date_parts: ``(month, day, year, "HH:MM")`` strings, in the order the
+            date pattern in ``process_files`` captures them.
+        local_tz: pytz timezone in which the wall-clock time is expressed.
+
+    Returns:
+        The instant formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
+
+    Raises:
+        ValueError: If the parts do not form a valid date/time.
+    """
+    month, day, year, time_str = date_parts
+    local_dt = datetime.strptime(f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M")
+    # localize() attaches the zone with the DST offset valid on that date;
+    # only then is the conversion to UTC meaningful.
+    utc_dt = local_tz.localize(local_dt).astimezone(pytz.utc)
+    return utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def process_files(starting_id, output_filename="output.xlsx"):
+    """Extract date/URL pairs from every .txt file in the CWD into a workbook.
+
+    Each file is scanned for bracketed dates and for http(s) URLs; the n-th
+    date is paired with the n-th URL. Dates are read as Europe/Prague local
+    time and written as ISO 8601 UTC. After the workbook has been saved, the
+    text files that contributed rows are deleted.
+
+    Args:
+        starting_id: Integer ID given to the first row; later rows count up.
+        output_filename: Path of the .xlsx file to write.
+
+    Returns:
+        None. Progress goes to stdout, warnings and errors to stderr.
+    """
+    # Sorted so that ID assignment is reproducible; os.listdir() order is arbitrary.
+    txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))
+
+    if not txt_files:
+        print("No .txt files found in the current directory.")
+        return
+
+    # Regex patterns for input data
+    date_pattern = re.compile(r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
+    url_pattern = re.compile(r'(https?://[^\s]+)')
+
+    # Source timezone of the timestamps (converted to UTC per row)
+    local_tz = pytz.timezone("Europe/Prague")
+
+    # Set up the Excel Workbook
+    wb = Workbook()
+    ws = wb.active
+    ws.title = "Fortuna Data"
+    ws.append(["ID", "URL", "Date_UTC"])  # Add headers
+
+    current_id = starting_id
+    success_files = []
+
+    for filename in txt_files:
+        try:
+            with open(filename, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            dates = date_pattern.findall(content)
+            urls = url_pattern.findall(content)
+
+            if len(dates) != len(urls):
+                print(
+                    f"Warning: {filename} has {len(dates)} dates but {len(urls)} URLs; "
+                    "pairing by position and ignoring the extras.",
+                    file=sys.stderr,
+                )
+
+            # Convert every pair BEFORE touching the sheet: if any date is
+            # invalid the file contributes no rows at all, so a later re-run
+            # cannot duplicate a half-imported file.
+            file_rows = [
+                (url, _to_utc_iso(parts, local_tz))
+                for parts, url in zip(dates, urls)
+            ]
+
+            if not file_rows:
+                # Nothing was captured, so deleting the file would lose its content.
+                print(f"No date/URL pairs found in {filename}; keeping the file.",
+                      file=sys.stderr)
+                continue
+
+            for url, formatted_date in file_rows:
+                ws.append([current_id, url, formatted_date])
+                current_id += 1
+
+            # Queue file for deletion
+            success_files.append(filename)
+
+        except Exception as e:
+            print(f"Error processing {filename}: {e}", file=sys.stderr)
+
+    # Save the Excel file; without a saved workbook nothing may be deleted.
+    try:
+        wb.save(output_filename)
+    except Exception as e:
+        print(f"Failed to save {output_filename}. No text files were deleted. Error: {e}",
+              file=sys.stderr)
+        return
+
+    print(f"Successfully saved data to {output_filename}")
+
+    # Clean up. Deletion failures are reported per file so that they are not
+    # mistaken for a failed save and do not stop the remaining deletions.
+    for filename in success_files:
+        try:
+            os.remove(filename)
+        except OSError as e:
+            print(f"Could not delete {filename}: {e}", file=sys.stderr)
+        else:
+            print(f"Deleted: {filename}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: a required starting ID plus an optional
+    # output path, both handed straight to process_files().
+    cli = argparse.ArgumentParser(
+        description="Extract URLs to an Excel file with ISO UTC dates.",
+    )
+    cli.add_argument("start_id", type=int, help="Starting ID for the output")
+    cli.add_argument(
+        "--output",
+        type=str,
+        default="extracted_data.xlsx",
+        help="Output Excel filename (default: extracted_data.xlsx)",
+    )
+    options = cli.parse_args()
+
+    process_files(starting_id=options.start_id, output_filename=options.output)