Scanner debug part 2 (done)

2026-03-21 18:48:54 +01:00
parent 8adc374408
commit abb59aabe3
1 changed file with 22 additions and 12 deletions
--- a/src/beaky/scanner/scanner.py
+++ b/src/beaky/scanner/scanner.py
@@ -12,7 +12,7 @@ class Link:
    """Represents a single link row from an Excel sheet.

    Attributes:
-        id: identifier from the sheet (kept as string)
+        id: identifier from the sheet (cast to int)
        url: link to the web page
        date: optional creation date (datetime or None)
    """
@@ -52,8 +52,8 @@ class Links:
        if not header:
            return []

-        # Normalize header names -> index map
-        header_map = {(str(h).strip().lower() if h is not None else ""): i for i, h in enumerate(header)}
+        # Normalize header names -> index map, making sure to use .value
+        header_map = {(str(h.value).strip().lower() if h.value is not None else ""): i for i, h in enumerate(header)}

        def parse_date(v: Any) -> Optional[datetime]:
            if v is None:
@@ -84,29 +84,35 @@ class Links:

        if id_idx is None or url_idx is None:
            # Required columns missing
-            print("Required 'id' or 'url' column missing in header")
+            print(f"Required 'id' or 'url' column missing in header. Found headers: {list(header_map.keys())}")
            return []

-        print(rows)
        for row in rows:
-            print(row)
            try:
-                raw_id = row[id_idx] if id_idx < len(row) else None
-                raw_url = row[url_idx] if url_idx < len(row) else None
-                raw_date = row[date_idx] if (date_idx is not None and date_idx < len(row)) else None
+                # Extract the actual values from the cell objects
+                raw_id = row[id_idx].value if id_idx < len(row) else None
+                raw_url = row[url_idx].value if url_idx < len(row) else None
+                raw_date = row[date_idx].value if (date_idx is not None and date_idx < len(row)) else None

-                if raw_id is None and raw_url is None:
+                if raw_id is None or raw_url is None:
                    # skip empty rows
                    continue

+                # Safely parse the ID to an integer, handling Excel float quirks
+                try:
+                    parsed_id = int(float(raw_id))
+                except (ValueError, TypeError):
+                    # Skip row if ID is missing or invalid text
+                    continue
+
                link = Link(
-                    id=str(raw_id).strip() if raw_id is not None else "",
+                    id=parsed_id,
                    url=str(raw_url).strip() if raw_url is not None else "",
                    date=parse_date(raw_date),
                )
                self.links.append(link)
            except Exception:
-                # Skip problematic rows silently
+                # Skip problematic rows silently (or print(e) for debugging)
                continue

        return self.links
@@ -127,3 +133,7 @@ if __name__ == "__main__":
    links = links_obj.ret_links()
    if not links:
        print("No links returned.")
+    else:
+        print(f"Successfully loaded {len(links)} links!")
+        for link in links:
+            print(link.id, link.url, link.date)