diff --git a/src/beaky/image_classifier/classifier.py b/src/beaky/image_classifier/classifier.py index 1ec1afa..22e4ecb 100644 --- a/src/beaky/image_classifier/classifier.py +++ b/src/beaky/image_classifier/classifier.py @@ -37,6 +37,7 @@ def _parse_block(lines: list[str]) -> Bet: # 1. Date extraction if lines: + # Regex is forgiving of letters attached to numbers due to OCR (e.g., s07.3.2026) date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0]) if date_m: try: @@ -53,17 +54,17 @@ def _parse_block(lines: list[str]) -> Bet: # 3. League extraction (typically contains a slash and sport name) for ln in lines: - if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]): + if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis", "Basketbal"]): league = ln.strip() break base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league} - # 4. Bet Type Classification (Strict parsing per tickety.md) + # 4. Bet Type Classification for ln in lines: lower_line = ln.lower() - # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu" + # Výsledek zápasu (1X2) m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line) if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line: pick = m_vysl.group(1).upper() @@ -75,7 +76,6 @@ def _parse_block(lines: list[str]) -> Bet: m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line) if m_dvoj: pick = m_dvoj.group(1).replace("x", "0").replace("X", "0") - # Normalize combinations to match literal definitions in ticket.py if pick in ["10", "01"]: pick = "01" elif pick in ["20", "02"]: @@ -86,7 +86,7 @@ def _parse_block(lines: list[str]) -> Bet: if pick in ["01", "12", "02"]: return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args) - # Výsledek zápasu bez remízy (Sázka bez remízy) + # Výsledek zápasu bez remízy m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line) if m_bez: return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args) @@ -97,12 +97,11 @@ def _parse_block(lines: list[str]) -> Bet: if m_btts.group(1) == "ano": return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args) else: - # `ticket.py` does not currently define resolution for "Ne" logic break - # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5) + # Počet gólů v zápasu m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line) - if m_goals and "tým" not in lower_line: # Avoids team-specific goals unless implemented + if m_goals and "tým" not in lower_line: sign = m_goals.group(1) val = float(m_goals.group(2)) is_over = sign == "+" @@ -119,7 +118,7 @@ def _parse_block(lines: list[str]) -> Bet: val = float(m_hcp.group(2)) return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args) - # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures + # Fallback return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args) @@ -143,29 +142,52 @@ def classify(text: str) -> list[Bet]: blocks = [] current_block = [] + in_block = False - # Sports that Fortuna commonly uses as block terminators - # We look for the line that contains e.g., "Fotbal / 1. Španělsko" - sport_delimiter_pattern = re.compile( - r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE - ) + # START trigger: Looks for 'dnes', 'zítra', or 'DD.MM.' + date_start_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.|\b(dnes|zítra|zitra|včera|vcera)\b)", re.IGNORECASE) + + # END trigger: Looks for standard Fortuna sport prefixes + sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE) for ln in lines: - current_block.append(ln) + is_start = date_start_pattern.search(ln) + is_end = sport_end_pattern.match(ln) - # If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block - if sport_delimiter_pattern.match(ln): + if is_start: + # If we somehow hit a start while already in a block (missing end marker fallback), + # save the current block before starting a new one. + if current_block: + print(f"ERROR: SUS block was not properly ended and new block start detected: '{ln}'") + blocks.append(current_block) + current_block = [ln] + in_block = True + + elif is_end: + # We hit the league/sport line. Add it, save the block, and close the window. + current_block.append(ln) blocks.append(current_block) current_block = [] + in_block = False - # Catch any dangling lines at the end that didn't have a clean sport delimiter - if current_block and len(current_block) > 1: + elif in_block: + # We are inside a block, gathering standard match info (teams, bet types). + current_block.append(ln) + + else: + # We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170"). + # We simply ignore it and do nothing. + print(f"INFO: Ignoring line outside of any block: '{ln}'") + pass + + # Catch any dangling block at the very end of the document + if current_block: blocks.append(current_block) # Parse each block into a separate Bet object for block in blocks: - print(f"--- Parsing Block ---\n{block}\n--- End Block ---") - bets.append(_parse_block(block)) + if len(block) > 1: # Ensure the block has enough lines to be valid + bets.append(_parse_block(block)) return bets @@ -183,9 +205,10 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket: extracted_text = img_to_text(str(file_path)) print(f"--- Extracted Text from {file_path.name} ---") - print(extracted_text) + # print(extracted_text) try: + print("--- START OF Classification Result ---") result = classify(extracted_text) except Exception as exc: print(f"classify() error for {file_path}: {exc}") @@ -200,8 +223,8 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket: ) ] - for bet in result: - print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}") + # for bet in result: + # print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}") ticket.bets.extend(result) @@ -210,4 +233,4 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket: if __name__ == "__main__": # Test script runner - img_classify(["./data/screenshots/2.png"], ticket_id=2) + img_classify(["./data/screenshots/26.png", "./data/screenshots/27.png"], ticket_id=2)