Add txt file extraction
This commit is contained in:
88
data/extract_to_excel.py
Normal file
88
data/extract_to_excel.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
import pytz
|
||||
from openpyxl import Workbook
|
||||
|
||||
|
||||
def process_files(starting_id, output_filename="output.xlsx"):
    """Extract URL/date pairs from ``.txt`` files into an Excel workbook.

    Every ``.txt`` file in the current working directory is scanned for
    bracketed timestamps (``[... <month> <day>, <year> at <H:MM>]``) and
    ``http(s)`` URLs.  The n-th timestamp is paired with the n-th URL of the
    same file, interpreted as Europe/Prague local time, converted to UTC and
    written as one row ``[ID, URL, Date_UTC]`` with sequential IDs.

    A source file is deleted only when the workbook was saved successfully,
    at least one row was extracted from it, and its timestamp and URL counts
    matched (so nothing was silently dropped by the pairing).

    Args:
        starting_id: ID given to the first extracted row; later rows count up.
        output_filename: Path of the ``.xlsx`` file to write.

    Returns:
        None.  Progress goes to stdout, problems to stderr.
    """
    # os.listdir() returns entries in arbitrary order; sort so that ID
    # assignment is reproducible from run to run.
    txt_files = sorted(
        name for name in os.listdir('.')
        if name.endswith('.txt') and os.path.isfile(name)
    )

    if not txt_files:
        print("No .txt files found in the current directory.")
        return

    # Regex patterns for input data
    date_pattern = re.compile(r'\[.*?(\d{1,2})\s+(\d{1,2}),\s+(\d{4})\s+at\s+(\d{1,2}:\d{2})\]')
    url_pattern = re.compile(r'(https?://[^\s]+)')

    # Source timestamps are Prague local time (CET in winter, CEST in
    # summer); pytz selects the right UTC offset for each date.
    local_tz = pytz.timezone("Europe/Prague")

    # Set up the Excel workbook
    wb = Workbook()
    ws = wb.active
    ws.title = "Fortuna Data"
    ws.append(["ID", "URL", "Date_UTC"])  # Add headers

    current_id = starting_id
    success_files = []

    for filename in txt_files:
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()

            dates = date_pattern.findall(content)
            urls = url_pattern.findall(content)

            # Build this file's rows off to the side so that a failure
            # half-way through leaves no partial rows (or consumed IDs) in
            # the sheet.
            rows = []
            for offset, ((month, day, year, time_str), url) in enumerate(zip(dates, urls)):
                local_dt = datetime.strptime(
                    f"{year}-{month}-{day} {time_str}", "%Y-%m-%d %H:%M"
                )
                utc_dt = local_tz.localize(local_dt).astimezone(pytz.utc)
                # ISO 8601 with 'T' separator and 'Z' suffix
                rows.append(
                    [current_id + offset, url, utc_dt.strftime("%Y-%m-%dT%H:%M:%SZ")]
                )
        except (OSError, ValueError, OverflowError) as e:
            # OSError: unreadable file; ValueError: bad encoding or an
            # impossible date; OverflowError: date outside datetime's range.
            print(f"Error processing {filename}: {e}", file=sys.stderr)
            continue

        for row in rows:
            ws.append(row)
        current_id += len(rows)

        if rows and len(dates) == len(urls):
            # Queue file for deletion
            success_files.append(filename)
        else:
            # Deleting here would lose data that never reached the sheet.
            print(
                f"Keeping {filename}: found {len(dates)} date(s) and "
                f"{len(urls)} URL(s); wrote {len(rows)} row(s).",
                file=sys.stderr,
            )

    # Save the Excel file
    try:
        wb.save(output_filename)
    except Exception as e:
        print(f"Failed to save {output_filename}. No text files were deleted. Error: {e}", file=sys.stderr)
        return
    print(f"Successfully saved data to {output_filename}")

    # Clean up only after a successful save.  Each removal is handled on its
    # own so one undeletable file is reported accurately and does not stop
    # the remaining clean-up.
    for filename in success_files:
        try:
            os.remove(filename)
        except OSError as e:
            print(f"Could not delete {filename}: {e}", file=sys.stderr)
        else:
            print(f"Deleted: {filename}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: one required starting ID plus an optional
    # output path, both handed straight to process_files().
    cli = argparse.ArgumentParser(
        description="Extract URLs to an Excel file with ISO UTC dates."
    )
    cli.add_argument(
        "start_id",
        type=int,
        help="Starting ID for the output",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="extracted_data.xlsx",
        help="Output Excel filename (default: extracted_data.xlsx)",
    )
    options = cli.parse_args()

    process_files(starting_id=options.start_id, output_filename=options.output)
|
||||
Reference in New Issue
Block a user