Implement xlsx parsing

2026-03-11 20:56:07 +01:00
parent e5c31ee0a3
commit 47a41828c6
2 changed files with 125 additions and 0 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,13 @@ description = "Scan tickets and decide"
 requires-python = ">=3.12"
 dependencies = [
    "pillow==12.1.1",
+<<<<<<< HEAD
    "pydantic==2.12.5",
    "pandas==3.0.1"
+=======
+    "openpyxl>=3.1.0",
+    "pydantic==2.12.5"
+>>>>>>> ec872d0 (Implement xlsx parsing)
 ]

 [project.optional-dependencies]
--- a/src/beaky/scanner/scanner.py
+++ b/src/beaky/scanner/scanner.py
@@ -1,7 +1,10 @@
 from datetime import datetime
+from typing import List, Optional, Iterator, Union

 from pydantic.dataclasses import dataclass

+from openpyxl import load_workbook
+
 from beaky.config import Config
 from beaky.datamodels.scan import Scan

@@ -21,3 +24,120 @@ class Scanner:

        o = Scan(date=datetime.now(), event_name = "neco")
        return o
+
+
+@dataclass
+class Link:
+    """One link record, as read from a single row of an Excel sheet.
+
+    Attributes:
+        id: identifier from the sheet (kept as string)
+        url: link to the web page
+        date: optional creation date (datetime or None); defaults to None
+    """
+    # Declared with the dataclass decorator imported from pydantic.dataclasses.
+    # NOTE(review): presumably pydantic validates these field types when an
+    # instance is constructed -- confirm against the pinned pydantic version.
+    id: str
+    url: str
+    date: Optional[datetime] = None
+
+
+class Links:
+    """Load ``Link`` objects from an Excel file (.xlsx).
+
+    Nothing is read at construction time; the workbook is opened only when
+    ``ret_links`` is called. Until then the container is empty.
+
+    Usage:
+        l = Links(path_to_xlsx)
+        links = l.ret_links()  # returns list[Link]
+        for link in l: ...
+    """
+
+    # Fallback formats tried in this order when a date cell is not ISO 8601.
+    # Day-first comes before month-first, so "01/02/2024" is read as
+    # 1 February 2024.
+    _DATE_FORMATS = ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d")
+
+    def __init__(self, path: Union[str, Config]):
+        """Store the workbook location without touching the file.
+
+        Args:
+            path: either the path to the .xlsx file, or a ``Config`` whose
+                ``path`` attribute holds it.
+        """
+        if isinstance(path, Config):
+            self._path = path.path
+        else:
+            self._path = path
+
+        self.links: List[Link] = []
+
+    def ret_links(self) -> List[Link]:
+        """Read the Excel file at ``self._path`` and (re)populate ``self.links``.
+
+        The workbook's active sheet must start with a header row containing
+        at least an 'id' column and a 'link' (or 'url') column; a 'date'
+        column is optional. Header names are matched case-insensitively and
+        ignoring surrounding whitespace. When both 'link' and 'url' exist,
+        'link' wins.
+
+        Each call replaces ``self.links`` with the freshly read rows, so
+        calling the method twice does not duplicate entries.
+
+        Returns:
+            The list of parsed links (also stored in ``self.links``). The
+            list is empty when the workbook has no active sheet, the sheet
+            has no header row, or a required column is missing. Rows whose
+            id and url cells are both empty are skipped; missing or
+            unparsable dates become None.
+        """
+        wb = load_workbook(filename=self._path, read_only=True, data_only=True)
+        try:
+            parsed = self._read_sheet(wb.active)
+        finally:
+            # A read-only workbook keeps the underlying file open until it
+            # is closed explicitly.
+            wb.close()
+
+        self.links = parsed
+        return self.links
+
+    @classmethod
+    def _read_sheet(cls, ws) -> List[Link]:
+        """Convert one worksheet (header row + data rows) into Link objects."""
+        if ws is None:
+            # The workbook contains no sheet to read.
+            return []
+
+        rows = ws.iter_rows(values_only=True)
+        header = next(rows, None)
+        if not header:
+            return []
+
+        # Normalized header name -> column index.
+        header_map = {
+            (str(h).strip().lower() if h is not None else ""): i
+            for i, h in enumerate(header)
+        }
+
+        id_idx = header_map.get("id")
+        # Compare against None explicitly: column index 0 is a valid hit,
+        # and "a or b" would wrongly discard it as falsy.
+        url_idx = header_map.get("link")
+        if url_idx is None:
+            url_idx = header_map.get("url")
+        date_idx = header_map.get("date")
+
+        if id_idx is None or url_idx is None:
+            # Required columns missing
+            return []
+
+        links: List[Link] = []
+        for row in rows:
+            raw_id = cls._cell(row, id_idx)
+            raw_url = cls._cell(row, url_idx)
+            raw_date = cls._cell(row, date_idx)
+
+            if raw_id is None and raw_url is None:
+                # skip empty rows
+                continue
+
+            try:
+                link = Link(
+                    id=str(raw_id).strip() if raw_id is not None else "",
+                    url=str(raw_url).strip() if raw_url is not None else "",
+                    date=cls._parse_date(raw_date),
+                )
+            except (TypeError, ValueError):
+                # Best effort: a row that cannot be turned into a Link is
+                # skipped rather than aborting the whole import (pydantic
+                # validation errors derive from ValueError).
+                continue
+            links.append(link)
+
+        return links
+
+    @staticmethod
+    def _cell(row, idx):
+        """Return ``row[idx]``, or None when idx is None or beyond the row."""
+        if idx is None or idx >= len(row):
+            return None
+        return row[idx]
+
+    @classmethod
+    def _parse_date(cls, v) -> Optional[datetime]:
+        """Coerce a cell value to datetime; return None when not possible.
+
+        datetime values pass through unchanged. Anything else is converted
+        to a stripped string and parsed as ISO 8601 first, then with each
+        format in ``_DATE_FORMATS``.
+        """
+        if v is None:
+            return None
+        if isinstance(v, datetime):
+            return v
+        s = str(v).strip()
+        if not s:
+            return None
+        try:
+            return datetime.fromisoformat(s)
+        except ValueError:
+            pass
+        for fmt in cls._DATE_FORMATS:
+            try:
+                return datetime.strptime(s, fmt)
+            except ValueError:
+                continue
+        # No known format matched.
+        return None
+
+    def __iter__(self) -> Iterator[Link]:
+        """Iterate over the links loaded by the last ``ret_links`` call."""
+        return iter(self.links)
+
+    def __len__(self) -> int:
+        """Return the number of links currently loaded."""
+        return len(self.links)
+
+
+# Backwards-compatible alias: Linker is the same class object as Links,
+# kept in case other modules referenced Linker.
+Linker = Links