# beaky/data/extract_to_excel.py

import os
import re
import sys
import argparse
from datetime import datetime
import pytz
from openpyxl import Workbook


def _build_rows(dates, urls, first_id, local_tz):
    """Pair dates with URLs by position and return the worksheet rows.

    Args:
        dates: ``(month, day, year, "HH:MM")`` string tuples as captured by
            the date regex in ``process_files``.
        urls: URL strings; the n-th URL is paired with the n-th date.
        first_id: ID given to the first row; later rows count up from it.
        local_tz: pytz timezone the wall-clock times are expressed in.

    Returns:
        A list of ``[id, url, utc_timestamp]`` rows, one per date/URL pair.
        Surplus dates or URLs (when the counts differ) are ignored.

    Raises:
        ValueError: If a captured date is not a valid calendar date/time.
    """
    rows = []
    for offset, ((month, day, year, time_str), url) in enumerate(zip(dates, urls)):
        local_dt = datetime.strptime(f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M")

        # localize() is called with pytz's default is_dst=False, so an
        # ambiguous wall-clock time at a DST changeover is resolved rather
        # than raising.
        utc_dt = local_tz.localize(local_dt).astimezone(pytz.utc)

        # ISO 8601 with the "T" separator and a literal "Z" suffix.
        rows.append([first_id + offset, url, utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")])
    return rows


def process_files(starting_id: int, output_filename: str = "output.xlsx") -> None:
    """Extract date/URL pairs from every .txt file in the CWD into a workbook.

    Each text file is scanned for bracketed timestamps and for http(s) URLs.
    The n-th timestamp is paired with the n-th URL, interpreted as
    Europe/Prague local time, converted to UTC and written as one row
    ``[ID, URL, Date_UTC]``. IDs count up from ``starting_id`` across files,
    which are visited in sorted name order so a re-run assigns the same IDs.

    Source files are deleted only after the workbook has been saved, and only
    when everything found in them was written: a file that failed to parse,
    produced no rows, or has differing date and URL counts is left in place
    and reported on stderr.

    Args:
        starting_id: ID assigned to the first extracted row.
        output_filename: Path of the .xlsx file to write.

    Returns:
        None. Progress is printed to stdout and problems to stderr.
    """
    # Sorted so that ID assignment does not depend on directory listing order.
    txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))

    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    # Regex patterns for input data
    date_pattern = re.compile(r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
    url_pattern = re.compile(r'(https?://[^\s]+)')

    # Timestamps in the text files are treated as Europe/Prague local time.
    local_tz = pytz.timezone("Europe/Prague")

    # Set up the Excel Workbook
    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])  # Add headers

    current_id = starting_id
    success_files = []

    for filename in txt_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = date_pattern.findall(content)
            urls = url_pattern.findall(content)

            # Convert every entry before touching the sheet, so a malformed
            # date cannot leave a half-imported file (and consumed IDs) behind.
            rows = _build_rows(dates, urls, current_id, local_tz)

            for row in rows:
                ws.append(row)
                current_id += 1

        except Exception as e:
            print(f"Error processing {filename}: {e}", file=sys.stderr)
            continue

        if len(dates) != len(urls):
            # Positional pairing may be misaligned; keep the source so the
            # unpaired entries are not lost and the rows can be checked.
            print(
                f"Warning: {filename} has {len(dates)} dates but {len(urls)} URLs; "
                f"paired the first {len(rows)} by position and kept the file.",
                file=sys.stderr,
            )
        elif not rows:
            # Nothing was captured from this file, so deleting it would only
            # destroy data (it may be an unrelated .txt file).
            print(f"No date/URL entries found in {filename}; file kept.", file=sys.stderr)
        else:
            # Queue file for deletion
            success_files.append(filename)

    # Save the Excel file; without a saved workbook nothing may be deleted.
    try:
        wb.save(output_filename)
    except Exception as e:
        print(f"Failed to save {output_filename}. No text files were deleted. Error: {e}", file=sys.stderr)
        return
    print(f"Successfully saved data to {output_filename}")

    # Clean up. A failed removal is reported on its own and does not stop the
    # remaining files from being removed.
    for filename in success_files:
        try:
            os.remove(filename)
        except OSError as e:
            print(f"Could not delete {filename}: {e}", file=sys.stderr)
        else:
            print(f"Deleted: {filename}")


if __name__ == "__main__":
    # Command-line entry point: a required integer starting ID and an
    # optional output path, both forwarded to process_files().
    cli = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates.",
    )
    cli.add_argument("start_id", type=int, help="Starting ID for the output")
    cli.add_argument(
        "--output",
        type=str,
        default="extracted_data.xlsx",
        help="Output Excel filename (default: extracted_data.xlsx)",
    )
    options = cli.parse_args()

    process_files(starting_id=options.start_id, output_filename=options.output)