From fd3a8f4955253faa7afd2bd6952199f4afb80acb Mon Sep 17 00:00:00 2001 From: Chlupaty Date: Sun, 22 Mar 2026 14:39:14 +0100 Subject: [PATCH] Image classifier debugging block separation and line separation --- src/beaky/image_classifier/classifier.py | 35 ++++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/beaky/image_classifier/classifier.py b/src/beaky/image_classifier/classifier.py index 9d1f598..1ec1afa 100644 --- a/src/beaky/image_classifier/classifier.py +++ b/src/beaky/image_classifier/classifier.py @@ -124,11 +124,7 @@ def _parse_block(lines: list[str]) -> Bet: def classify(text: str) -> list[Bet]: - """Return a list of Bet objects parsed from OCR `text`. - - Separates the text into chunks per bet based on date/time markers, then - parses each chunk individually. - """ + """Return a list of Bet objects parsed from OCR `text`.""" text = (text or "").strip() if not text: return [ @@ -145,28 +141,30 @@ def classify(text: str) -> list[Bet]: lines = [ln.strip() for ln in text.splitlines() if ln.strip()] bets: list[Bet] = [] - # Chunk the OCR output by looking for Date/Time markers - # (e.g. "pá 6. 3. 2026 21:00", "dnes 18:30") blocks = [] current_block = [] - date_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.\s*\d{4}|\bdnes\b|\bzítra\b).*?\d{2}:\d{2}", re.IGNORECASE) + + # Sports that Fortuna commonly uses as block terminators + # We look for the line that contains e.g., "Fotbal / 1. Španělsko" + sport_delimiter_pattern = re.compile( + r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE + ) for ln in lines: - if date_pattern.search(ln): - if current_block: - blocks.append(current_block) - current_block = [ln] - else: - current_block.append(ln) + current_block.append(ln) - if current_block: + # If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block + if sport_delimiter_pattern.match(ln): + blocks.append(current_block) + current_block = [] + + # Catch any dangling lines at the end that didn't have a clean sport delimiter + if current_block and len(current_block) > 1: blocks.append(current_block) - if not blocks: - blocks = [lines] # Fallback if regex missed the date boundaries - # Parse each block into a separate Bet object for block in blocks: + print(f"--- Parsing Block ---\n{block}\n--- End Block ---") bets.append(_parse_block(block)) return bets @@ -180,6 +178,7 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket: for file in paths: file_path = Path(file) if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions): + print(f"Skipping invalid file: {file}") continue extracted_text = img_to_text(str(file_path))