# beaky/data/extract_to_excel.py -- Python source (88 lines, 2.9 KiB in the original file listing)

import os
import re
import sys
import argparse
from datetime import datetime
import pytz
from openpyxl import Workbook
def process_files(starting_id, output_filename="output.xlsx"):
    """Collect URL/timestamp pairs from local ``.txt`` files into an Excel sheet.

    Every ``.txt`` file in the current working directory is scanned for
    bracketed timestamps of the form ``[... <month> <day>, <year> at <H:MM>]``
    (numeric month and day) and for ``http(s)://`` URLs.  The n-th timestamp
    is paired with the n-th URL of the same file.  Timestamps are interpreted
    as Europe/Prague local time and written as ISO 8601 UTC strings
    (``YYYY-MM-DDTHH:MM:SSZ``).

    After the workbook has been saved, source files that were extracted
    completely are deleted.  A file is kept (and a warning is printed) when
    it could not be processed, when it yielded no rows, or when its number of
    timestamps differs from its number of URLs, so that unextracted data is
    never removed.

    Args:
        starting_id: ID given to the first extracted row; each following row
            gets the next consecutive integer.
        output_filename: Path of the ``.xlsx`` file to write.

    Returns:
        None.  Progress goes to stdout, warnings and errors to stderr.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # ID assignment reproducible from run to run.
    txt_files = sorted(f for f in os.listdir('.') if f.endswith('.txt'))
    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    # Regex patterns for input data
    date_pattern = re.compile(r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
    url_pattern = re.compile(r'(https?://[^\s]+)')

    # Source timestamps are local Prague time (CET/CEST) and get converted to UTC.
    local_tz = pytz.timezone("Europe/Prague")

    # Set up the Excel workbook with a header row.
    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])

    current_id = starting_id
    success_files = []

    for filename in txt_files:
        # Per-file boundary: one unreadable or malformed file must not stop
        # the whole batch, so any failure is reported and the file is kept.
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = date_pattern.findall(content)
            urls = url_pattern.findall(content)

            # Build all rows for this file first, so that a parse error
            # halfway through leaves neither partial rows nor consumed IDs.
            rows = []
            for offset, ((month, day, year, time_str), url) in enumerate(zip(dates, urls)):
                local_dt = datetime.strptime(
                    f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M"
                )
                utc_dt = local_tz.localize(local_dt).astimezone(pytz.utc)
                rows.append(
                    [current_id + offset, url, utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")]
                )

            for row in rows:
                ws.append(row)
        except Exception as e:
            print(f"Error processing {filename}: {e}", file=sys.stderr)
            continue

        current_id += len(rows)

        # Only delete files whose content was extracted completely.
        if not rows:
            print(
                f"No date/URL pairs found in {filename}; file kept.",
                file=sys.stderr,
            )
        elif len(dates) != len(urls):
            print(
                f"{filename}: found {len(dates)} dates but {len(urls)} URLs; "
                f"extracted {len(rows)} rows and kept the file.",
                file=sys.stderr,
            )
        else:
            success_files.append(filename)

    # Save the Excel file; nothing is deleted unless this succeeds.
    try:
        wb.save(output_filename)
    except Exception as e:
        print(f"Failed to save {output_filename}. No text files were deleted. Error: {e}", file=sys.stderr)
        return
    print(f"Successfully saved data to {output_filename}")

    # Clean up. A failed deletion is reported on its own and does not stop
    # the remaining deletions or masquerade as a save failure.
    for filename in success_files:
        try:
            os.remove(filename)
        except OSError as e:
            print(f"Could not delete {filename}: {e}", file=sys.stderr)
        else:
            print(f"Deleted: {filename}")
if __name__ == "__main__":
    # Script entry point: read the starting ID and the optional output path
    # from the command line, then run the extraction.
    cli = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates.",
    )
    cli.add_argument(
        "start_id",
        type=int,
        help="Starting ID for the output",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="extracted_data.xlsx",
        help="Output Excel filename (default: extracted_data.xlsx)",
    )
    options = cli.parse_args()
    process_files(starting_id=options.start_id, output_filename=options.output)