From 47a41828c60b27cbd6d478e409e9b70ccdd0dfba Mon Sep 17 00:00:00 2001
From: Chlupaty
Date: Wed, 11 Mar 2026 20:56:07 +0100
Subject: [PATCH] Implement xlsx parsing

---
NOTE(review): line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy. Confirm the context lines against the tree
before applying; the index hashes below predate the review changes.

 pyproject.toml               |   1 +
 src/beaky/scanner/scanner.py | 158 +++++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index e1eab1c..3b4654e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,9 @@ description = "Scan tickets and decide"
 requires-python = ">=3.12"
 dependencies = [
     "pillow==12.1.1",
+    "openpyxl>=3.1.0",
     "pydantic==2.12.5",
     "pandas==3.0.1"
 ]
 
 [project.optional-dependencies]
diff --git a/src/beaky/scanner/scanner.py b/src/beaky/scanner/scanner.py
index 010c5f8..1e82cf7 100644
--- a/src/beaky/scanner/scanner.py
+++ b/src/beaky/scanner/scanner.py
@@ -1,7 +1,11 @@
+import logging
 from datetime import datetime
+from typing import List, Optional, Iterator, Union
 
 from pydantic.dataclasses import dataclass
 
+from openpyxl import load_workbook
+
 from beaky.config import Config
 from beaky.datamodels.scan import Scan
 
@@ -21,3 +25,157 @@ class Scanner:
 
         o = Scan(date=datetime.now(), event_name = "neco")
         return o
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Link:
+    """Represents a single link row from an Excel sheet.
+
+    Attributes:
+        id: identifier from the sheet (kept as string)
+        url: link to the web page
+        date: optional creation date (datetime or None)
+    """
+    id: str
+    url: str
+    date: Optional[datetime] = None
+
+
+class Links:
+    """Loads Link objects from an Excel file (.xlsx).
+
+    Usage:
+        l = Links(path_to_xlsx)
+        links = l.ret_links()  # returns list[Link]
+        for link in l: ...
+    """
+
+    # Tried in order after ISO parsing, so day-first wins for ambiguous dates.
+    _DATE_FORMATS = ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d")
+
+    def __init__(self, path: Union[str, Config]):
+        # Accept either a raw path string or a Config with .path attribute
+        if isinstance(path, Config):
+            self._path = path.path
+        else:
+            self._path = path
+
+        self.links: List[Link] = []
+
+    def ret_links(self) -> List[Link]:
+        """Read the Excel file at self._path and return its rows as Link objects.
+
+        Expects the active sheet to start with a header row whose columns
+        include at least 'id' and 'link' (or 'url'), plus an optional 'date'
+        (case-insensitive). Dates are stored as datetime when they can be
+        parsed; missing or unparsable dates become None.
+
+        self.links is replaced on every call, so calling this method again
+        re-reads the file instead of appending duplicates. An empty list is
+        returned when the workbook has no active sheet, the sheet is empty, or
+        a required column is missing.
+        """
+        links: List[Link] = []
+        wb = load_workbook(filename=self._path, read_only=True, data_only=True)
+        try:
+            ws = wb.active
+            if ws is not None:
+                links = self._read_sheet(ws)
+        finally:
+            # Read-only workbooks keep the file open until closed explicitly.
+            wb.close()
+
+        self.links = links
+        return self.links
+
+    def _read_sheet(self, ws) -> List[Link]:
+        """Convert the rows below the header of worksheet *ws* into Links."""
+        rows = ws.iter_rows(values_only=True)
+        header = next(rows, None)
+        if not header:
+            return []
+
+        # Normalize header names -> index map
+        header_map = {
+            (str(h).strip().lower() if h is not None else ""): i
+            for i, h in enumerate(header)
+        }
+
+        id_idx = header_map.get("id")
+        # Compare against None explicitly: column index 0 is valid but falsy.
+        url_idx = header_map.get("link")
+        if url_idx is None:
+            url_idx = header_map.get("url")
+        date_idx = header_map.get("date")
+
+        if id_idx is None or url_idx is None:
+            # Required columns missing
+            return []
+
+        def cell(row, idx):
+            # Guard against a missing column and rows shorter than the header.
+            if idx is None or idx >= len(row):
+                return None
+            return row[idx]
+
+        links: List[Link] = []
+        for row_number, row in enumerate(rows, start=2):
+            raw_id = cell(row, id_idx)
+            raw_url = cell(row, url_idx)
+            if raw_id is None and raw_url is None:
+                # skip empty rows
+                continue
+
+            try:
+                link = Link(
+                    id=str(raw_id).strip() if raw_id is not None else "",
+                    url=str(raw_url).strip() if raw_url is not None else "",
+                    date=self._parse_date(cell(row, date_idx)),
+                )
+            except Exception:
+                # Best effort: a bad row must not abort the import, but it is
+                # logged rather than dropped silently.
+                logger.warning(
+                    "Skipping row %d of %s", row_number, self._path, exc_info=True
+                )
+                continue
+            links.append(link)
+
+        return links
+
+    @staticmethod
+    def _parse_date(v) -> Optional[datetime]:
+        """Return *v* as a datetime, or None when it is empty or unparsable."""
+        if v is None:
+            return None
+        if isinstance(v, datetime):
+            return v
+        s = str(v).strip()
+        if not s:
+            return None
+        # Try ISO
+        try:
+            return datetime.fromisoformat(s)
+        except ValueError:
+            pass
+        # Try common formats
+        for fmt in Links._DATE_FORMATS:
+            try:
+                return datetime.strptime(s, fmt)
+            except ValueError:
+                continue
+        # Give up
+        return None
+
+    def __iter__(self) -> Iterator[Link]:
+        return iter(self.links)
+
+    def __len__(self) -> int:
+        return len(self.links)
+
+
+# Backwards-compatible alias in case other modules referenced Linker
+Linker = Links