diff --git a/src/beaky/scanner/scanner.py b/src/beaky/scanner/scanner.py
index dd10937..70e2180 100644
--- a/src/beaky/scanner/scanner.py
+++ b/src/beaky/scanner/scanner.py
@@ -12,7 +12,7 @@ class Link:
     """Represents a single link row from an Excel sheet.
 
     Attributes:
-        id: identifier from the sheet (kept as string)
+        id: identifier from the sheet (cast to int)
         url: link to the web page
         date: optional creation date (datetime or None)
     """
@@ -52,8 +52,8 @@ class Links:
         if not header:
             return []
 
-        # Normalize header names -> index map
-        header_map = {(str(h).strip().lower() if h is not None else ""): i for i, h in enumerate(header)}
+        # Normalize header names -> index map, making sure to use .value
+        header_map = {(str(h.value).strip().lower() if h.value is not None else ""): i for i, h in enumerate(header)}
 
         def parse_date(v: Any) -> Optional[datetime]:
             if v is None:
@@ -84,29 +84,35 @@ class Links:
 
         if id_idx is None or url_idx is None:
             # Required columns missing
-            print("Required 'id' or 'url' column missing in header")
+            print(f"Required 'id' or 'url' column missing in header. Found headers: {list(header_map.keys())}")
             return []
 
-        print(rows)
         for row in rows:
-            print(row)
             try:
-                raw_id = row[id_idx] if id_idx < len(row) else None
-                raw_url = row[url_idx] if url_idx < len(row) else None
-                raw_date = row[date_idx] if (date_idx is not None and date_idx < len(row)) else None
+                # Extract the actual values from the cell objects
+                raw_id = row[id_idx].value if id_idx < len(row) else None
+                raw_url = row[url_idx].value if url_idx < len(row) else None
+                raw_date = row[date_idx].value if (date_idx is not None and date_idx < len(row)) else None
 
-                if raw_id is None and raw_url is None:
+                if raw_id is None or raw_url is None:
                     # skip empty rows
                     continue
 
+                # Safely parse the ID to an integer, handling Excel float quirks
+                try:
+                    parsed_id = int(float(raw_id))
+                except (ValueError, TypeError):
+                    # Skip row if ID is missing or invalid text
+                    continue
+
                 link = Link(
-                    id=str(raw_id).strip() if raw_id is not None else "",
+                    id=parsed_id,
                     url=str(raw_url).strip() if raw_url is not None else "",
                     date=parse_date(raw_date),
                 )
                 self.links.append(link)
             except Exception:
-                # Skip problematic rows silently
+                # Skip problematic rows silently (or print(e) for debugging)
                 continue
 
         return self.links
@@ -127,3 +133,7 @@ if __name__ == "__main__":
     links = links_obj.ret_links()
     if not links:
         print("No links returned.")
+    else:
+        print(f"Successfully loaded {len(links)} links!")
+        for link in links:
+            print(link.id, link.url, link.date)