Image classifier debugging block separation and line separation

Classifier work
2026-03-22 14:39:35 +01:00 · 2026-03-22 14:39:35 +01:00
1 changed file with 147 additions and 128 deletions
--- a/src/beaky/image_classifier/classifier.py
+++ b/src/beaky/image_classifier/classifier.py
@@ -1,9 +1,8 @@
 import datetime
 import logging
 import re
 from pathlib import Path
-from pytesseract import pytesseract
+import pytesseract
 from beaky.datamodels.ticket import (
    Advance,
@@ -21,174 +20,194 @@ from beaky.datamodels.ticket import (
 def img_to_text(path: str) -> str:
-    """Given a path to an image, return the text contained in that image.
+    """Read text from image using tesseract; returns empty string on error."""
    Bypasses PIL and lets Tesseract read the file directly.
    """
    try:
-        text = pytesseract.image_to_string(path, lang="ces")
+        return pytesseract.image_to_string(path, lang="ces").strip()
        return text.strip()
    except pytesseract.TesseractNotFoundError:
        print("Error: Tesseract executable not found on your system.")
        return ""
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return ""
-def classify(text: str) -> Bet:
+def _parse_block(lines: list[str]) -> Bet:
-    """Given text extracted from an image and a date, return a Bet object that is
+    """Parses a single block of text representing exactly one bet."""
    relevant to that text."""
    logger = logging.getLogger(__name__)
    if not text:
        return UnknownBet(
            ticketType=BetType.UNKNOWN,
            team1Name="N/A",
            team2Name="N/A",
            date=datetime.datetime.now(),
            league="N/A",
            raw_text="No text extracted",
        )
    # 1. Defaults & Normalization
    text_lower = text.lower()
    date_obj = datetime.datetime.now()
    team1, team2 = "Unknown", "Unknown"
    league = "Unknown"
    date_obj = datetime.datetime.now()
    raw_text = "\n".join(lines)
-    # 2. Heuristic extraction of Teams (Looking for "Team A - Team B" patterns)
+    # 1. Date extraction
-    lines = [line.strip() for line in text.split("\n") if line.strip()]
+    if lines:
-    for line in lines:
+        date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
-        if " - " in line or " vs " in line or " v " in line:
+        if date_m:
-            # Avoid splitting on hyphens in dates or numbers
+            try:
-            if not re.search(r"\d\s*-\s*\d", line):
+                date_obj = datetime.datetime(int(date_m.group(3)), int(date_m.group(2)), int(date_m.group(1)))
-                parts = re.split(r" - | vs | v ", line)
+            except ValueError:
-                if len(parts) >= 2:
+                pass
                    team1, team2 = parts[0].strip(), parts[1].strip()
                    break
-    # 3. Heuristic extraction of Date (Looking for DD.MM. YYYY HH:MM)
+    # 2. Teams extraction (usually the line after the date)
-    date_match = re.search(r"(\d{1,2}\.\s*\d{1,2}\.?\s*(?:\d{2,4})?)\s*(\d{1,2}:\d{2})?", text)
+    if len(lines) > 1:
-    if date_match:
+        ln_norm = re.sub(r"[–—−]", "-", lines[1])
-        try:
+        m = re.match(r"^(.+?)\s*-\s*(.+)$", ln_norm)
-            # Fallback to current year if missing, basic parse attempt
+        if m:
-            date_str = f"{date_match.group(1).replace(' ', '')} {date_match.group(2) or '00:00'}"
+            team1, team2 = m.group(1).strip(), m.group(2).strip()
-            if len(date_str.split(".")[2]) <= 5:  # Missing year
+
-                date_str = date_str.replace(" ", f"{datetime.datetime.now().year} ")
+    # 3. League extraction (typically contains a slash and sport name)
-            date_obj = datetime.datetime.strptime(date_str, "%d.%m.%Y %H:%M")
+    for ln in lines:
-        except Exception:
+        if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]):
-            pass  # Keep default if parsing fails
+            league = ln.strip()
            break
    # 4. Classification Logic based on keywords
    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
-    # Advance / Postup
+    # 4. Bet Type Classification (Strict parsing per tickety.md)
-    if any(kw in text_lower for kw in ["postup", "postoupí", "advance"]):
+    for ln in lines:
-        return Advance(ticketType=BetType.ADVANCED, **base_args)
+        lower_line = ln.lower()
-    # Both Teams to Score / Oba dají gól
+        # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
-    if any(kw in text_lower for kw in ["oba dají gól", "btts", "oba týmy dají gól"]):
+        m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
-        return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
+        if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
            pick = m_vysl.group(1).upper()
            if pick == "X":
                pick = "0"
            return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=pick, **base_args)
-    # Goal Amount (Over/Under)
+        # Výsledek zápasu - dvojtip (01, 02, 12, etc.)
-    if any(kw in text_lower for kw in ["počet gólů", "více než", "méně než", "over", "under"]):
+        m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
-        # Attempt to find the goal line (e.g., 2.5, 3.5)
+        if m_dvoj:
-        line_match = re.search(r"(\d+\.\d+)", text)
+            pick = m_dvoj.group(1).replace("x", "0").replace("X", "0")
-        line_val = float(line_match.group(1)) if line_match else 2.5
+            # Normalize combinations to match literal definitions in ticket.py
-        is_over = any(kw in text_lower for kw in ["více", "over", "+"])
+            if pick in ["10", "01"]:
                pick = "01"
            elif pick in ["20", "02"]:
                pick = "02"
            elif pick in ["12", "21"]:
                pick = "12"
-        return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=line_val, over=is_over, **base_args)
+            if pick in ["01", "12", "02"]:
                return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)
-    # Goal Handicap
+        # Výsledek zápasu bez remízy (Sázka bez remízy)
-    if any(kw in text_lower for kw in ["handicap", "hcp"]):
+        m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
-        hcp_match = re.search(r"([+-]?\d+\.\d+)", text)
+        if m_bez:
-        hcp_val = float(hcp_match.group(1)) if hcp_match else 0.0
+            return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)
        # Simplistic logic: guess team 1 if not explicitly stated
        team_bet = "2" if " 2 " in text else "1"
-        return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=hcp_val, **base_args)
+        # Každý z týmů dá gól v zápasu
        m_btts = re.search(r"každý z týmů dá gól.*?:\s*(ano|ne)$", lower_line)
        if m_btts:
            if m_btts.group(1) == "ano":
                return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
            else:
                # `ticket.py` does not currently define resolution for "Ne" logic
                break
-    # Win Draw Lose Double (1X, X2, 12)
+        # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
-    if any(kw in text_lower for kw in ["1x", "x2", "12", "dvojitá šance", "neprohra"]):
+        m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
-        bet_type = "01" if "1x" in text_lower else "02" if "x2" in text_lower else "12"
+        if m_goals and "tým" not in lower_line:  # Avoids team-specific goals unless implemented
-        return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=bet_type, **base_args)
+            sign = m_goals.group(1)
            val = float(m_goals.group(2))
            is_over = sign == "+"
            return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=val, over=is_over, **base_args)
-    # Win Lose (Draw no bet / Vítěz do rozhodnutí)
+        # Kdo postoupí
-    if any(kw in text_lower for kw in ["bez remízy", "vítěz do rozhodnutí", "konečný vítěz"]):
+        if "postoupí" in lower_line or "postup" in lower_line:
-        bet_type = "2" if re.search(r"\b2\b", text) else "1"
+            return Advance(ticketType=BetType.ADVANCED, **base_args)
        return WinLose(ticketType=BetType.WIN_LOSE, betType=bet_type, **base_args)
-    # Win Draw Lose (Standard Match Odds)
+        # Handicap v zápasu
-    if any(kw in text_lower for kw in ["zápas", "výsledek zápasu", "1x2"]):
+        m_hcp = re.search(r"handicap\s*(1|2)\s*:?\s*([+-]?[\d.]+)$", lower_line)
-        # Look for isolated 1, X (or 0), or 2
+        if m_hcp:
-        match_pick = re.search(r"\b(1|x|0|2)\b", text_lower)
+            team_bet = m_hcp.group(1)
-        bet_type = match_pick.group(1).upper() if match_pick else "1"
+            val = float(m_hcp.group(2))
-        if bet_type == "X":
+            return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args)
            bet_type = "0"
-        return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=bet_type, **base_args)
+    # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
-    # Fallback Unknown
+
-    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=text, **base_args)
+def classify(text: str) -> list[Bet]:
    """Return a list of Bet objects parsed from OCR `text`."""
    text = (text or "").strip()
    if not text:
        return [
            UnknownBet(
                ticketType=BetType.UNKNOWN,
                team1Name="N/A",
                team2Name="N/A",
                date=datetime.datetime.now(),
                league="N/A",
                raw_text="No text extracted",
            )
        ]
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    bets: list[Bet] = []
    blocks = []
    current_block = []
    # Sports that Fortuna commonly uses as block terminators
    # We look for the line that contains e.g., "Fotbal / 1. Španělsko"
    sport_delimiter_pattern = re.compile(
        r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
    )
    for ln in lines:
        current_block.append(ln)
        # If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
        if sport_delimiter_pattern.match(ln):
            blocks.append(current_block)
            current_block = []
    # Catch any dangling lines at the end that didn't have a clean sport delimiter
    if current_block and len(current_block) > 1:
        blocks.append(current_block)
    # Parse each block into a separate Bet object
    for block in blocks:
        print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
        bets.append(_parse_block(block))
    return bets
 def img_classify(paths: list[str], ticket_id: int) -> Ticket:
-    """Given a path to an image and a date, return a list of Tickets that are
+    """Given a list of file paths to images, classify each and collect bets into a Ticket."""
    relevant to that image and date."""
    # Define valid image extensions to ignore system files or text documents
    ticket = Ticket(id=ticket_id, bets=[])
    valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
    # Iterate through all files in the folder
    for file in paths:
        file_path = Path(file)
-        if file_path.is_file() and file_path.suffix.lower() in valid_extensions:
+        if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
-            # 1. Extract the text (called separately)
+            print(f"Skipping invalid file: {file}")
-            extracted_text = img_to_text(str(file_path))
+            continue
            print(extracted_text)
-            # 2. Classify based on the extracted text (called separately)
+        extracted_text = img_to_text(str(file_path))
-            try:
+        print(f"--- Extracted Text from {file_path.name} ---")
-                result = classify(extracted_text)
+        print(extracted_text)
-            except Exception as exc:  # pragma: no cover - defensive fallback
+
-                # Ensure result is always defined so downstream code cannot reference an unbound name
+        try:
-                print(f"classify() raised an exception: {exc}")
+            result = classify(extracted_text)
-                result = UnknownBet(
+        except Exception as exc:
            print(f"classify() error for {file_path}: {exc}")
            result = [
                UnknownBet(
                    ticketType=BetType.UNKNOWN,
                    team1Name="N/A",
                    team2Name="N/A",
                    date=datetime.datetime.now(),
                    league="N/A",
                    raw_text=extracted_text,
                    date=datetime.datetime.now(),
                )
            ]
-            # 3. Add the resulting tickets to our main list
+        for bet in result:
-            # Support classifier returning either a single Bet or a list of Bet
+            print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
            if result is None:
                continue
-            if isinstance(result, list):
+        ticket.bets.extend(result)
                for r in result:
                    print(
                        r.date,
                        getattr(r, "ticketType", None),
                        r.team1Name,
                        r.team2Name,
                        r.league,
                    )
                ticket.bets.extend(result)
            else:
                print(
                    result.date,
                    getattr(result, "ticketType", None),
                    result.team1Name,
                    result.team2Name,
                    result.league,
                )
                ticket.bets.append(result)
    return ticket
 if __name__ == "__main__":
-    img_classify(["./data/screenshots/2.png"], ticket_id=1)
+    # Test script runner
    img_classify(["./data/screenshots/2.png"], ticket_id=2)
Author	SHA1	Message	Date
Chlupaty	fd3a8f4955	Image classifier debugging block separation and line separation	2026-03-22 14:39:35 +01:00
Chlupaty	346318177d	Classifier work	2026-03-22 14:39:35 +01:00