From 6914b620f171972bc08124dcb4776c73310e2510 Mon Sep 17 00:00:00 2001
From: Chlupaty
Date: Sun, 22 Mar 2026 01:15:13 +0100
Subject: [PATCH] Created image classifier v.1

---
Note: whitespace-mangled patch restored to applicable unified-diff form.
Adds the missing `import re` to classifier.py and renames the Config field
`img_classifer` -> `img_classifier` to match config/application.yml; the
post-image blob ids on the affected `index` lines are therefore stale.

 config/application.yml                   |   2 +
 src/beaky/config.py                      |   4 +-
 src/beaky/image_classifier/classifier.py | 185 ++++++++++++++++++++++-
 src/beaky/image_classifier/config.py     |   2 +-
 src/beaky/scanner/scanner.py             |   5 +-
 5 files changed, 188 insertions(+), 10 deletions(-)

diff --git a/config/application.yml b/config/application.yml
index dbd40c4..77e1ba7 100644
--- a/config/application.yml
+++ b/config/application.yml
@@ -2,3 +2,5 @@ path: data/odkazy.xlsx
 
 screenshotter:
   target_path: data/screenshots/
+img_classifier:
+  target_path: "data/screenshots/"
\ No newline at end of file
diff --git a/src/beaky/config.py b/src/beaky/config.py
index 45dbe12..9c0879e 100644
--- a/src/beaky/config.py
+++ b/src/beaky/config.py
@@ -1,9 +1,11 @@
 from pydantic.dataclasses import dataclass
 
+from beaky.image_classifier.config import ImgClassifierConfig
 from beaky.screenshotter.config import ScreenshotterConfig
 
 
 @dataclass
 class Config:
     path: str
-    screenshotter: ScreenshotterConfig
\ No newline at end of file
+    screenshotter: ScreenshotterConfig
+    img_classifier: ImgClassifierConfig
diff --git a/src/beaky/image_classifier/classifier.py b/src/beaky/image_classifier/classifier.py
index 5c9d2f9..a1bc8d4 100644
--- a/src/beaky/image_classifier/classifier.py
+++ b/src/beaky/image_classifier/classifier.py
@@ -1,7 +1,184 @@
-from datetime import datetime
+import datetime
+import re
+from pathlib import Path
 
-from beaky.datamodels.ticket import Ticket
+import pytesseract
+
+from beaky.datamodels.ticket import (
+    Advance,
+    Bet,
+    BetType,
+    BothTeamScored,
+    GoalAmount,
+    GoalHandicap,
+    Ticket,
+    UnknownTicket,
+    WinDrawLose,
+    WinDrawLoseDouble,
+    WinLose,
+)
 
 
-def f(path:str, date:datetime) -> list[Ticket]:
-    ...
\ No newline at end of file
+def img_to_text(path: str) -> str:
+    """Given a path to an image, return the text contained in that image.
+    Bypasses PIL and lets Tesseract read the file directly.
+    """
+    try:
+        text = pytesseract.image_to_string(path)
+        return text.strip()
+    except pytesseract.TesseractNotFoundError:
+        print("Error: Tesseract executable not found on your system.")
+        return ""
+    except Exception as e:
+        print(f"Error processing {path}: {e}")
+        return ""
+
+
+def classify(text: str) -> Bet:
+    """Classify OCR text from a ticket screenshot into a Bet via keyword heuristics.
+    Returns an UnknownTicket when the text is empty or no keyword group matches."""
+
+    if not text:
+        return UnknownTicket(
+            ticketType=BetType.UNKNOWN,
+            team1Name="N/A",
+            team2Name="N/A",
+            date=datetime.datetime.now(),
+            league="N/A",
+            raw_text="No text extracted",
+        )
+
+    # 1. Defaults & Normalization
+    text_lower = text.lower()
+    date_obj = datetime.datetime.now()
+    team1, team2 = "Unknown", "Unknown"
+    league = "Unknown"
+
+    # 2. Heuristic extraction of Teams (Looking for "Team A - Team B" patterns)
+    lines = [line.strip() for line in text.split("\n") if line.strip()]
+    for line in lines:
+        if " - " in line or " vs " in line or " v " in line:
+            # Avoid splitting on hyphens in dates or numbers
+            if not re.search(r"\d\s*-\s*\d", line):
+                parts = re.split(r" - | vs | v ", line)
+                if len(parts) >= 2:
+                    team1, team2 = parts[0].strip(), parts[1].strip()
+                    break
+
+    # 3. Heuristic extraction of Date (Looking for DD.MM. YYYY HH:MM)
+    date_match = re.search(r"(\d{1,2}\.\s*\d{1,2}\.?\s*(?:\d{2,4})?)\s*(\d{1,2}:\d{2})?", text)
+    if date_match:
+        try:
+            # Fallback to current year if missing, basic parse attempt
+            date_str = f"{date_match.group(1).replace(' ', '')} {date_match.group(2) or '00:00'}"
+            if len(date_str.split(".")[2]) <= 5:  # Missing year
+                date_str = date_str.replace(" ", f"{datetime.datetime.now().year} ")
+            date_obj = datetime.datetime.strptime(date_str, "%d.%m.%Y %H:%M")
+        except Exception:
+            pass  # Keep default if parsing fails
+
+    # 4. Classification Logic based on keywords
+    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
+
+    # Advance / Postup
+    if any(kw in text_lower for kw in ["postup", "postoupí", "advance"]):
+        return Advance(ticketType=BetType.ADVANCED, **base_args)
+
+    # Both Teams to Score / Oba dají gól
+    if any(kw in text_lower for kw in ["oba dají gól", "btts", "oba týmy dají gól"]):
+        return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
+
+    # Goal Amount (Over/Under)
+    if any(kw in text_lower for kw in ["počet gólů", "více než", "méně než", "over", "under"]):
+        # Attempt to find the goal line (e.g., 2.5, 3.5)
+        line_match = re.search(r"(\d+\.\d+)", text)
+        line_val = float(line_match.group(1)) if line_match else 2.5
+        is_over = any(kw in text_lower for kw in ["více", "over", "+"])
+
+        return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=line_val, over=is_over, **base_args)
+
+    # Goal Handicap
+    if any(kw in text_lower for kw in ["handicap", "hcp"]):
+        hcp_match = re.search(r"([+-]?\d+\.\d+)", text)
+        hcp_val = float(hcp_match.group(1)) if hcp_match else 0.0
+        # Simplistic logic: guess team 1 if not explicitly stated
+        team_bet = "2" if " 2 " in text else "1"
+
+        return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=hcp_val, **base_args)
+
+    # Win Draw Lose Double (1X, X2, 12)
+    if any(kw in text_lower for kw in ["1x", "x2", "12", "dvojitá šance", "neprohra"]):
+        bet_type = "01" if "1x" in text_lower else "02" if "x2" in text_lower else "12"
+        return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=bet_type, **base_args)
+
+    # Win Lose (Draw no bet / Vítěz do rozhodnutí)
+    if any(kw in text_lower for kw in ["bez remízy", "vítěz do rozhodnutí", "konečný vítěz"]):
+        bet_type = "2" if re.search(r"\b2\b", text) else "1"
+        return WinLose(ticketType=BetType.WIN_LOSE, betType=bet_type, **base_args)
+
+    # Win Draw Lose (Standard Match Odds)
+    if any(kw in text_lower for kw in ["zápas", "výsledek zápasu", "1x2"]):
+        # Look for isolated 1, X (or 0), or 2
+        match_pick = re.search(r"\b(1|x|0|2)\b", text_lower)
+        bet_type = match_pick.group(1).upper() if match_pick else "1"
+        if bet_type == "X":
+            bet_type = "0"
+
+        return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=bet_type, **base_args)
+
+    # Fallback Unknown
+    return UnknownTicket(ticketType=BetType.UNKNOWN, raw_text=text, **base_args)
+
+
+def img_classify(path: str, ticket_id: int) -> Ticket:
+    """Given a path to a directory of images, OCR and classify every image in it and
+    return a single Ticket (with the given id) holding the resulting bets."""
+    # Define valid image extensions to ignore system files or text documents
+    ticket = Ticket(id=ticket_id, bets=[])
+    valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
+    path_obj = Path(path)
+
+    if not path_obj.is_dir():
+        print(f"Error: The path '{path}' is not a valid directory.")
+        return ticket
+
+    # Iterate through all files in the folder
+    for file_path in path_obj.iterdir():
+        if file_path.is_file() and file_path.suffix.lower() in valid_extensions:
+            # 1. Extract the text (called separately)
+            extracted_text = img_to_text(str(file_path))
+            print(extracted_text)
+
+            # 2. Classify based on the extracted text (called separately)
+            result = classify(extracted_text)
+
+            # 3. Add the resulting bets to the ticket
+            # Support classifier returning either a single Bet or a list of Bet
+            if result is None:
+                continue
+
+            if isinstance(result, list):
+                for r in result:
+                    print(
+                        r.date,
+                        getattr(r, "ticketType", None),
+                        r.team1Name,
+                        r.team2Name,
+                        r.league,
+                    )
+                ticket.bets.extend(result)
+            else:
+                print(
+                    result.date,
+                    getattr(result, "ticketType", None),
+                    result.team1Name,
+                    result.team2Name,
+                    result.league,
+                )
+                ticket.bets.append(result)
+
+    return ticket
+
+
+if __name__ == "__main__":
+    img_classify("./data/screenshots/", ticket_id=1)
diff --git a/src/beaky/image_classifier/config.py b/src/beaky/image_classifier/config.py
index bad5698..8ccdf9d 100644
--- a/src/beaky/image_classifier/config.py
+++ b/src/beaky/image_classifier/config.py
@@ -3,4 +3,4 @@ from pydantic.dataclasses import dataclass
 
 @dataclass
 class ImgClassifierConfig:
-    neco: str
+    target_path: str
diff --git a/src/beaky/scanner/scanner.py b/src/beaky/scanner/scanner.py
index 8d9bf2e..2d61f32 100644
--- a/src/beaky/scanner/scanner.py
+++ b/src/beaky/scanner/scanner.py
@@ -21,6 +21,7 @@ class Link:
     url: str
     date: Optional[datetime] = None
 
+
 class Links:
     def __init__(self, path: str | Config):
         if isinstance(path, Config):
@@ -123,10 +124,6 @@ class Links:
         return len(self.links)
 
 
-# Backwards-compatible alias in case other modules referenced Linker
-Linker = Links
-
-
 if __name__ == "__main__":
     links_obj = Links("data/odkazy.xlsx")
     links = links_obj.ret_links()