From 346318177d341bc7d816e0a4b41244796250d908 Mon Sep 17 00:00:00 2001 From: Chlupaty Date: Sun, 22 Mar 2026 12:42:45 +0100 Subject: [PATCH] Classifier work --- src/beaky/image_classifier/classifier.py | 276 ++++++++++++----------- 1 file changed, 148 insertions(+), 128 deletions(-) diff --git a/src/beaky/image_classifier/classifier.py b/src/beaky/image_classifier/classifier.py index 4d050d6..9d1f598 100644 --- a/src/beaky/image_classifier/classifier.py +++ b/src/beaky/image_classifier/classifier.py @@ -1,9 +1,8 @@ import datetime -import logging import re from pathlib import Path -from pytesseract import pytesseract +import pytesseract from beaky.datamodels.ticket import ( Advance, @@ -21,174 +20,195 @@ from beaky.datamodels.ticket import ( def img_to_text(path: str) -> str: - """Given a path to an image, return the text contained in that image. - Bypasses PIL and lets Tesseract read the file directly. - """ + """Read text from image using tesseract; returns empty string on error.""" try: - text = pytesseract.image_to_string(path, lang="ces") - return text.strip() - except pytesseract.TesseractNotFoundError: - print("Error: Tesseract executable not found on your system.") - return "" + return pytesseract.image_to_string(path, lang="ces").strip() except Exception as e: print(f"Error processing {path}: {e}") return "" -def classify(text: str) -> Bet: - """Given text extracted from an image and a date, return a Bet object that is - relevant to that text.""" - - logger = logging.getLogger(__name__) - if not text: - return UnknownBet( - ticketType=BetType.UNKNOWN, - team1Name="N/A", - team2Name="N/A", - date=datetime.datetime.now(), - league="N/A", - raw_text="No text extracted", - ) - - # 1. Defaults & Normalization - text_lower = text.lower() - date_obj = datetime.datetime.now() +def _parse_block(lines: list[str]) -> Bet: + """Parses a single block of text representing exactly one bet.""" team1, team2 = "Unknown", "Unknown" league = "Unknown" + date_obj = datetime.datetime.now() + raw_text = "\n".join(lines) - # 2. Heuristic extraction of Teams (Looking for "Team A - Team B" patterns) - lines = [line.strip() for line in text.split("\n") if line.strip()] - for line in lines: - if " - " in line or " vs " in line or " v " in line: - # Avoid splitting on hyphens in dates or numbers - if not re.search(r"\d\s*-\s*\d", line): - parts = re.split(r" - | vs | v ", line) - if len(parts) >= 2: - team1, team2 = parts[0].strip(), parts[1].strip() - break + # 1. Date extraction + if lines: + date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0]) + if date_m: + try: + date_obj = datetime.datetime(int(date_m.group(3)), int(date_m.group(2)), int(date_m.group(1))) + except ValueError: + pass - # 3. Heuristic extraction of Date (Looking for DD.MM. YYYY HH:MM) - date_match = re.search(r"(\d{1,2}\.\s*\d{1,2}\.?\s*(?:\d{2,4})?)\s*(\d{1,2}:\d{2})?", text) - if date_match: - try: - # Fallback to current year if missing, basic parse attempt - date_str = f"{date_match.group(1).replace(' ', '')} {date_match.group(2) or '00:00'}" - if len(date_str.split(".")[2]) <= 5: # Missing year - date_str = date_str.replace(" ", f"{datetime.datetime.now().year} ") - date_obj = datetime.datetime.strptime(date_str, "%d.%m.%Y %H:%M") - except Exception: - pass # Keep default if parsing fails + # 2. Teams extraction (usually the line after the date) + if len(lines) > 1: + ln_norm = re.sub(r"[–—−]", "-", lines[1]) + m = re.match(r"^(.+?)\s*-\s*(.+)$", ln_norm) + if m: + team1, team2 = m.group(1).strip(), m.group(2).strip() + + # 3. League extraction (typically contains a slash and sport name) + for ln in lines: + if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]): + league = ln.strip() + break - # 4. Classification Logic based on keywords base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league} - # Advance / Postup - if any(kw in text_lower for kw in ["postup", "postoupí", "advance"]): - return Advance(ticketType=BetType.ADVANCED, **base_args) + # 4. Bet Type Classification (Strict parsing per tickety.md) + for ln in lines: + lower_line = ln.lower() - # Both Teams to Score / Oba dají gól - if any(kw in text_lower for kw in ["oba dají gól", "btts", "oba týmy dají gól"]): - return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args) + # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu" + m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line) + if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line: + pick = m_vysl.group(1).upper() + if pick == "X": + pick = "0" + return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=pick, **base_args) - # Goal Amount (Over/Under) - if any(kw in text_lower for kw in ["počet gólů", "více než", "méně než", "over", "under"]): - # Attempt to find the goal line (e.g., 2.5, 3.5) - line_match = re.search(r"(\d+\.\d+)", text) - line_val = float(line_match.group(1)) if line_match else 2.5 - is_over = any(kw in text_lower for kw in ["více", "over", "+"]) + # Výsledek zápasu - dvojtip (01, 02, 12, etc.) + m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line) + if m_dvoj: + pick = m_dvoj.group(1).replace("x", "0").replace("X", "0") + # Normalize combinations to match literal definitions in ticket.py + if pick in ["10", "01"]: + pick = "01" + elif pick in ["20", "02"]: + pick = "02" + elif pick in ["12", "21"]: + pick = "12" - return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=line_val, over=is_over, **base_args) + if pick in ["01", "12", "02"]: + return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args) - # Goal Handicap - if any(kw in text_lower for kw in ["handicap", "hcp"]): - hcp_match = re.search(r"([+-]?\d+\.\d+)", text) - hcp_val = float(hcp_match.group(1)) if hcp_match else 0.0 - # Simplistic logic: guess team 1 if not explicitly stated - team_bet = "2" if " 2 " in text else "1" + # Výsledek zápasu bez remízy (Sázka bez remízy) + m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line) + if m_bez: + return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args) - return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=hcp_val, **base_args) + # Každý z týmů dá gól v zápasu + m_btts = re.search(r"každý z týmů dá gól.*?:\s*(ano|ne)$", lower_line) + if m_btts: + if m_btts.group(1) == "ano": + return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args) + else: + # `ticket.py` does not currently define resolution for "Ne" logic + break - # Win Draw Lose Double (1X, X2, 12) - if any(kw in text_lower for kw in ["1x", "x2", "12", "dvojitá šance", "neprohra"]): - bet_type = "01" if "1x" in text_lower else "02" if "x2" in text_lower else "12" - return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=bet_type, **base_args) + # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5) + m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line) + if m_goals and "tým" not in lower_line: # Avoids team-specific goals unless implemented + sign = m_goals.group(1) + val = float(m_goals.group(2)) + is_over = sign == "+" + return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=val, over=is_over, **base_args) - # Win Lose (Draw no bet / Vítěz do rozhodnutí) - if any(kw in text_lower for kw in ["bez remízy", "vítěz do rozhodnutí", "konečný vítěz"]): - bet_type = "2" if re.search(r"\b2\b", text) else "1" - return WinLose(ticketType=BetType.WIN_LOSE, betType=bet_type, **base_args) + # Kdo postoupí + if "postoupí" in lower_line or "postup" in lower_line: + return Advance(ticketType=BetType.ADVANCED, **base_args) - # Win Draw Lose (Standard Match Odds) - if any(kw in text_lower for kw in ["zápas", "výsledek zápasu", "1x2"]): - # Look for isolated 1, X (or 0), or 2 - match_pick = re.search(r"\b(1|x|0|2)\b", text_lower) - bet_type = match_pick.group(1).upper() if match_pick else "1" - if bet_type == "X": - bet_type = "0" + # Handicap v zápasu + m_hcp = re.search(r"handicap\s*(1|2)\s*:?\s*([+-]?[\d.]+)$", lower_line) + if m_hcp: + team_bet = m_hcp.group(1) + val = float(m_hcp.group(2)) + return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args) - return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=bet_type, **base_args) + # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures + return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args) - # Fallback Unknown - return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=text, **base_args) + +def classify(text: str) -> list[Bet]: + """Return a list of Bet objects parsed from OCR `text`. + + Separates the text into chunks per bet based on date/time markers, then + parses each chunk individually. + """ + text = (text or "").strip() + if not text: + return [ + UnknownBet( + ticketType=BetType.UNKNOWN, + team1Name="N/A", + team2Name="N/A", + date=datetime.datetime.now(), + league="N/A", + raw_text="No text extracted", + ) + ] + + lines = [ln.strip() for ln in text.splitlines() if ln.strip()] + bets: list[Bet] = [] + + # Chunk the OCR output by looking for Date/Time markers + # (e.g. "pá 6. 3. 2026 21:00", "dnes 18:30") + blocks = [] + current_block = [] + date_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.\s*\d{4}|\bdnes\b|\bzítra\b).*?\d{2}:\d{2}", re.IGNORECASE) + + for ln in lines: + if date_pattern.search(ln): + if current_block: + blocks.append(current_block) + current_block = [ln] + else: + current_block.append(ln) + + if current_block: + blocks.append(current_block) + + if not blocks: + blocks = [lines] # Fallback if regex missed the date boundaries + + # Parse each block into a separate Bet object + for block in blocks: + bets.append(_parse_block(block)) + + return bets def img_classify(paths: list[str], ticket_id: int) -> Ticket: - """Given a path to an image and a date, return a list of Tickets that are - relevant to that image and date.""" - # Define valid image extensions to ignore system files or text documents + """Given a list of file paths to images, classify each and collect bets into a Ticket.""" ticket = Ticket(id=ticket_id, bets=[]) valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"} - # Iterate through all files in the folder for file in paths: file_path = Path(file) - if file_path.is_file() and file_path.suffix.lower() in valid_extensions: - # 1. Extract the text (called separately) - extracted_text = img_to_text(str(file_path)) - print(extracted_text) + if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions): + continue - # 2. Classify based on the extracted text (called separately) - try: - result = classify(extracted_text) - except Exception as exc: # pragma: no cover - defensive fallback - # Ensure result is always defined so downstream code cannot reference an unbound name - print(f"classify() raised an exception: {exc}") - result = UnknownBet( + extracted_text = img_to_text(str(file_path)) + print(f"--- Extracted Text from {file_path.name} ---") + print(extracted_text) + + try: + result = classify(extracted_text) + except Exception as exc: + print(f"classify() error for {file_path}: {exc}") + result = [ + UnknownBet( ticketType=BetType.UNKNOWN, team1Name="N/A", team2Name="N/A", + date=datetime.datetime.now(), league="N/A", raw_text=extracted_text, - date=datetime.datetime.now(), ) + ] - # 3. Add the resulting tickets to our main list - # Support classifier returning either a single Bet or a list of Bet - if result is None: - continue + for bet in result: + print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}") - if isinstance(result, list): - for r in result: - print( - r.date, - getattr(r, "ticketType", None), - r.team1Name, - r.team2Name, - r.league, - ) - ticket.bets.extend(result) - else: - print( - result.date, - getattr(result, "ticketType", None), - result.team1Name, - result.team2Name, - result.league, - ) - ticket.bets.append(result) + ticket.bets.extend(result) return ticket if __name__ == "__main__": - img_classify(["./data/screenshots/2.png"], ticket_id=1) + # Test script runner + img_classify(["./data/screenshots/2.png"], ticket_id=2)