Implement xlsx parsing
This commit is contained in:
committed by Chlupatý Kokotko
parent e5c31ee0a3
commit 47a41828c6
@@ -9,8 +9,13 @@ description = "Scan tickets and decide"
 requires-python = ">=3.12"
 dependencies = [
     "pillow==12.1.1",
+<<<<<<< HEAD
     "pydantic==2.12.5",
     "pandas==3.0.1"
+=======
+    "openpyxl>=3.1.0",
+    "pydantic==2.12.5"
+>>>>>>> ec872d0 (Implement xlsx parsing)
 ]
 
 [project.optional-dependencies]
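Note that this hunk commits the merge-conflict markers themselves, so the resulting pyproject.toml will not parse. A possible resolved dependencies block, assuming both the pandas pin from HEAD and the new openpyxl requirement are intended, might look like the sketch below (not part of the commit):

dependencies = [
    "pillow==12.1.1",
    "pydantic==2.12.5",
    "pandas==3.0.1",
    "openpyxl>=3.1.0",
]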
@@ -1,7 +1,10 @@
 from datetime import datetime
+from typing import List, Optional, Iterator, Union
 
 from pydantic.dataclasses import dataclass
 
+from openpyxl import load_workbook
+
 from beaky.config import Config
 from beaky.datamodels.scan import Scan
 
@@ -21,3 +24,120 @@ class Scanner:
 
         o = Scan(date=datetime.now(), event_name = "neco")
         return o
+
+
+@dataclass
+class Link:
+    """Represents a single link row from an Excel sheet.
+
+    Attributes:
+        id: identifier from the sheet (kept as string)
+        url: link to the web page
+        date: optional creation date (datetime or None)
+    """
+    id: str
+    url: str
+    date: Optional[datetime] = None
+
+
+class Links:
+    """Loads Link objects from an Excel file (.xlsx).
+
+    Usage:
+        l = Links(path_to_xlsx)
+        links = l.ret_links()  # returns list[Link]
+        for link in l: ...
+    """
+
+    def __init__(self, path: Union[str, Config]):
+        # Accept either a raw path string or a Config with .path attribute
+        if isinstance(path, Config):
+            self._path = path.path
+        else:
+            self._path = path
+
+        self.links: List[Link] = []
+
+    def ret_links(self) -> List[Link]:
+        """Read the Excel file at self._path and populate self.links.
+
+        Expects the first sheet to contain a header row with columns that include
+        at least: 'id', 'link' (or 'url'), and 'date' (case-insensitive). The
+        method will attempt to parse dates and will store them as datetime when
+        possible; missing or unparsable dates become None.
+        """
+        wb = load_workbook(filename=self._path, read_only=True, data_only=True)
+        ws = wb.active
+
+        # Read header row
+        rows = ws.iter_rows(values_only=True)
+        try:
+            header = next(rows)
+        except StopIteration:
+            return []
+
+        if not header:
+            return []
+
+        # Normalize header names -> index map
+        header_map = {(str(h).strip().lower() if h is not None else ""): i for i, h in enumerate(header)}
+
+        # Helper to parse date-like values
+        def parse_date(v) -> Optional[datetime]:
+            if v is None:
+                return None
+            if isinstance(v, datetime):
+                return v
+            s = str(v).strip()
+            if not s:
+                return None
+            # Try ISO
+            try:
+                return datetime.fromisoformat(s)
+            except Exception:
+                pass
+            # Try common formats
+            for fmt in ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"):
+                try:
+                    return datetime.strptime(s, fmt)
+                except Exception:
+                    continue
+            # Give up
+            return None
+
+        # Find the column indices we care about
+        id_idx = header_map.get("id")
+        url_idx = header_map.get("link") or header_map.get("url")
+        date_idx = header_map.get("date")
+
+        if id_idx is None or url_idx is None:
+            # Required columns missing
+            return []
+
+        for row in rows:
+            try:
+                raw_id = row[id_idx] if id_idx < len(row) else None
+                raw_url = row[url_idx] if url_idx < len(row) else None
+                raw_date = row[date_idx] if (date_idx is not None and date_idx < len(row)) else None
+
+                if raw_id is None and raw_url is None:
+                    # skip empty rows
+                    continue
+
+                link = Link(id=str(raw_id).strip() if raw_id is not None else "", url=str(raw_url).strip() if raw_url is not None else "", date=parse_date(raw_date))
+                self.links.append(link)
+            except Exception:
+                # Skip problematic rows silently
+                continue
+
+        return self.links
+
+    def __iter__(self) -> Iterator[Link]:
+        return iter(self.links)
+
+    def __len__(self) -> int:
+        return len(self.links)
+
+
+# Backwards-compatible alias in case other modules referenced Linker
+Linker = Links
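For reference, a minimal usage sketch of the new Links class, based only on the API added above. The module path beaky.scanner and the file name links.xlsx are assumptions; neither appears in this commit.

# Usage sketch (assumed import path and file name; not part of the commit)
from beaky.scanner import Links  # hypothetical location of the module patched above

links = Links("links.xlsx")              # or Links(cfg) with a beaky Config exposing .path
for link in links.ret_links():           # reads the first sheet, returns list[Link]
    print(link.id, link.url, link.date)  # date is None when missing or unparsable
print(len(links))                        # __len__ reports the number of parsed rows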