# Extracts URLs and local timestamps from .txt files in the current
# directory into an Excel workbook with ISO-8601 UTC dates.
import os
import re
import sys
import argparse
from datetime import datetime
import pytz
from openpyxl import Workbook


def process_files(starting_id, output_filename="output.xlsx"):
    """Extract URL/date pairs from every .txt file in the current directory
    into an Excel workbook, then delete the processed files on success.

    Each extracted row is (sequential ID, URL, ISO-8601 UTC timestamp).
    Timestamps in the text files are treated as Europe/Prague local time
    and converted to UTC.

    Args:
        starting_id: First ID value; incremented once per extracted row.
        output_filename: Path of the .xlsx file to write.
    """
    # Find all txt files in the current directory
    txt_files = [f for f in os.listdir('.') if f.endswith('.txt')]

    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    # Regex patterns for input data.
    # Date lines look like "[... M D, YYYY at H:MM]" — month before day
    # (presumably US-style ordering; verify against the source files).
    date_pattern = re.compile(r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
    url_pattern = re.compile(r'(https?://[^\s]+)')

    # Timezone setup (Europe/Prague local time to UTC)
    local_tz = pytz.timezone("Europe/Prague")

    # Set up the Excel workbook
    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])  # Add headers

    current_id = starting_id
    success_files = []  # files fully processed; deleted only after a successful save

    for filename in txt_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = date_pattern.findall(content)
            urls = url_pattern.findall(content)

            # Pair dates with URLs positionally; surplus items of either
            # kind are silently dropped (min() bounds the loop).
            for i in range(min(len(dates), len(urls))):
                month, day, year, time_str = dates[i]

                # Parse the naive local datetime from the text file
                dt_str = f"{year}-{month}-{day} {time_str}"
                local_dt = datetime.strptime(dt_str, "%Y-%m-%d %H:%M")

                # Convert local (CET/CEST) time to UTC.
                # NOTE(review): localize() can raise on DST-ambiguous or
                # non-existent times; such rows are caught by the outer
                # except and the whole file is skipped.
                localized_dt = local_tz.localize(local_dt)
                utc_dt = localized_dt.astimezone(pytz.utc)

                # Format to ISO 8601 with T separator and Z suffix
                formatted_date = utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")

                # Add a new row to the Excel sheet
                ws.append([current_id, urls[i], formatted_date])
                current_id += 1

            # Queue file for deletion
            success_files.append(filename)

        except Exception as e:
            # BUGFIX: the original message printed the literal text
            # "(unknown)" instead of interpolating the file name.
            print(f"Error processing {filename}: {e}", file=sys.stderr)

    # Save the Excel file
    try:
        wb.save(output_filename)
        print(f"Successfully saved data to {output_filename}")

        # Clean up only if save was successful
        for filename in success_files:
            os.remove(filename)
            # BUGFIX: the original message printed the literal text
            # "(unknown)" instead of interpolating the file name.
            print(f"Deleted: {filename}")

    except Exception as e:
        print(f"Failed to save {output_filename}. No text files were deleted. Error: {e}", file=sys.stderr)


if __name__ == "__main__":
    # CLI entry point: a required starting ID plus an optional output path.
    cli = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates."
    )
    cli.add_argument(
        "start_id",
        type=int,
        help="Starting ID for the output",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="extracted_data.xlsx",
        help="Output Excel filename (default: extracted_data.xlsx)",
    )
    opts = cli.parse_args()

    process_files(opts.start_id, opts.output)