From fd3a8f4955253faa7afd2bd6952199f4afb80acb Mon Sep 17 00:00:00 2001
From: Chlupaty <chlupatykokotko@gmail.com>
Date: Sun, 22 Mar 2026 14:39:14 +0100
Subject: [PATCH] Image classifier: split bet blocks on sport delimiter lines
 and add debug prints

---
 src/beaky/image_classifier/classifier.py | 35 ++++++++++++------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/beaky/image_classifier/classifier.py b/src/beaky/image_classifier/classifier.py
index 9d1f598..1ec1afa 100644
--- a/src/beaky/image_classifier/classifier.py
+++ b/src/beaky/image_classifier/classifier.py
@@ -124,11 +124,7 @@ def _parse_block(lines: list[str]) -> Bet:
 
 
 def classify(text: str) -> list[Bet]:
-    """Return a list of Bet objects parsed from OCR `text`.
-
-    Separates the text into chunks per bet based on date/time markers, then
-    parses each chunk individually.
-    """
+    """Return a list of Bet objects parsed from OCR `text`."""
     text = (text or "").strip()
     if not text:
         return [
@@ -145,28 +141,30 @@ def classify(text: str) -> list[Bet]:
     lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
     bets: list[Bet] = []
 
-    # Chunk the OCR output by looking for Date/Time markers
-    # (e.g. "pá 6. 3. 2026 21:00", "dnes 18:30")
     blocks = []
     current_block = []
-    date_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.\s*\d{4}|\bdnes\b|\bzítra\b).*?\d{2}:\d{2}", re.IGNORECASE)
+
+    # Sports that Fortuna commonly uses as block terminators
+    # We look for a line that starts with e.g. "Fotbal / 1. Španělsko"
+    sport_delimiter_pattern = re.compile(
+        r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
+    )
 
     for ln in lines:
-        if date_pattern.search(ln):
-            if current_block:
-                blocks.append(current_block)
-            current_block = [ln]
-        else:
-            current_block.append(ln)
+        current_block.append(ln)
 
-    if current_block:
+        # If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
+        if sport_delimiter_pattern.match(ln):
+            blocks.append(current_block)
+            current_block = []
+
+    # Keep trailing lines that lack a sport delimiter; a single leftover line is dropped
+    if current_block and len(current_block) > 1:
         blocks.append(current_block)
 
-    if not blocks:
-        blocks = [lines]  # Fallback if regex missed the date boundaries
-
     # Parse each block into a separate Bet object
     for block in blocks:
+        print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
         bets.append(_parse_block(block))
 
     return bets
@@ -180,6 +178,7 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
     for file in paths:
         file_path = Path(file)
         if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
+            print(f"Skipping invalid file: {file}")
             continue
 
         extracted_text = img_to_text(str(file_path))