Image classifier: debugged line separation and block separation

2026-03-22 15:58:23 +01:00
parent 8edd016e39
commit 60bbc09ccc
1 changed files with 48 additions and 25 deletions
--- a/src/beaky/image_classifier/classifier.py
+++ b/src/beaky/image_classifier/classifier.py
@@ -37,6 +37,7 @@ def _parse_block(lines: list[str]) -> Bet:

    # 1. Date extraction
    if lines:
+        # Regex is forgiving of letters attached to numbers due to OCR (e.g., s07.3.2026)
        date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
        if date_m:
            try:
@@ -53,17 +54,17 @@ def _parse_block(lines: list[str]) -> Bet:

    # 3. League extraction (typically contains a slash and sport name)
    for ln in lines:
-        if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]):
+        if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis", "Basketbal"]):
            league = ln.strip()
            break

    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}

-    # 4. Bet Type Classification (Strict parsing per tickety.md)
+    # 4. Bet Type Classification
    for ln in lines:
        lower_line = ln.lower()

-        # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
+        # Výsledek zápasu (1X2)
        m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
        if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
            pick = m_vysl.group(1).upper()
@@ -75,7 +76,6 @@ def _parse_block(lines: list[str]) -> Bet:
        m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
        if m_dvoj:
            pick = m_dvoj.group(1).replace("x", "0").replace("X", "0")
-            # Normalize combinations to match literal definitions in ticket.py
            if pick in ["10", "01"]:
                pick = "01"
            elif pick in ["20", "02"]:
@@ -86,7 +86,7 @@ def _parse_block(lines: list[str]) -> Bet:
            if pick in ["01", "12", "02"]:
                return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)

-        # Výsledek zápasu bez remízy (Sázka bez remízy)
+        # Výsledek zápasu bez remízy
        m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
        if m_bez:
            return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)
@@ -97,12 +97,11 @@ def _parse_block(lines: list[str]) -> Bet:
            if m_btts.group(1) == "ano":
                return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
            else:
-                # `ticket.py` does not currently define resolution for "Ne" logic
                break

-        # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
+        # Počet gólů v zápasu
        m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
-        if m_goals and "tým" not in lower_line:  # Avoids team-specific goals unless implemented
+        if m_goals and "tým" not in lower_line:
            sign = m_goals.group(1)
            val = float(m_goals.group(2))
            is_over = sign == "+"
@@ -119,7 +118,7 @@ def _parse_block(lines: list[str]) -> Bet:
            val = float(m_hcp.group(2))
            return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args)

-    # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
+    # Fallback
    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)


@@ -143,28 +142,51 @@ def classify(text: str) -> list[Bet]:

    blocks = []
    current_block = []
+    in_block = False

-    # Sports that Fortuna commonly uses as block terminators
-    # We look for the line that contains e.g., "Fotbal / 1. Španělsko"
-    sport_delimiter_pattern = re.compile(
-        r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
-    )
+    # START trigger: Looks for 'dnes', 'zítra', or 'DD.MM.'
+    date_start_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.|\b(dnes|zítra|zitra|včera|vcera)\b)", re.IGNORECASE)
+
+    # END trigger: Looks for standard Fortuna sport prefixes
+    sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE)

    for ln in lines:
-        current_block.append(ln)
+        is_start = date_start_pattern.search(ln)
+        is_end = sport_end_pattern.match(ln)

-        # If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
-        if sport_delimiter_pattern.match(ln):
+        if is_start:
+            # If we somehow hit a start while already in a block (missing end marker fallback),
+            # save the current block before starting a new one.
+            if current_block:
+                print(f"ERROR: SUS block was not properly ended and new block start detected: '{ln}'")
+                blocks.append(current_block)
+            current_block = [ln]
+            in_block = True
+
+        elif is_end:
+            # We hit the league/sport line. Add it, save the block, and close the window.
+            current_block.append(ln)
            blocks.append(current_block)
            current_block = []
+            in_block = False

-    # Catch any dangling lines at the end that didn't have a clean sport delimiter
-    if current_block and len(current_block) > 1:
+        elif in_block:
+            # We are inside a block, gathering standard match info (teams, bet types).
+            current_block.append(ln)
+
+        else:
+            # We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170").
+            # We simply ignore it and do nothing.
+            print(f"INFO: Ignoring line outside of any block: '{ln}'")
+            pass
+
+    # Catch any dangling block at the very end of the document
+    if current_block:
        blocks.append(current_block)

    # Parse each block into a separate Bet object
    for block in blocks:
-        print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
+        if len(block) > 1:  # Ensure the block has enough lines to be valid
            bets.append(_parse_block(block))

    return bets
@@ -183,9 +205,10 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:

        extracted_text = img_to_text(str(file_path))
        print(f"--- Extracted Text from {file_path.name} ---")
-        print(extracted_text)
+        # print(extracted_text)

        try:
+            print("--- START OF Classification Result ---")
            result = classify(extracted_text)
        except Exception as exc:
            print(f"classify() error for {file_path}: {exc}")
@@ -200,8 +223,8 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
                )
            ]

-        for bet in result:
-            print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
+        # for bet in result:
+        #     print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")

        ticket.bets.extend(result)

@@ -210,4 +233,4 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:

 if __name__ == "__main__":
    # Test script runner
-    img_classify(["./data/screenshots/2.png"], ticket_id=2)
+    img_classify(["./data/screenshots/26.png", "./data/screenshots/27.png"], ticket_id=2)