Compare commits

..

2 Commits

Author SHA1 Message Date
Chlupaty
fd3a8f4955 Image classifier debugging block separation and line separation 2026-03-22 14:39:35 +01:00
Chlupaty
346318177d Classifier work 2026-03-22 14:39:35 +01:00

View File

@@ -1,9 +1,8 @@
import datetime
import logging
import re
from pathlib import Path
from pytesseract import pytesseract
import pytesseract
from beaky.datamodels.ticket import (
Advance,
@@ -21,27 +20,115 @@ from beaky.datamodels.ticket import (
def img_to_text(path: str) -> str:
    """Read text from an image using Tesseract; return an empty string on error.

    The path is handed straight to ``pytesseract.image_to_string`` (no PIL
    loading step) with the Czech (``ces``) language model, and the recognised
    text is stripped of leading/trailing whitespace.

    Args:
        path: Path to the image file to OCR.

    Returns:
        The stripped OCR text, or ``""`` when Tesseract is not installed or any
        other error occurs. The error is printed, never raised.
    """
    try:
        return pytesseract.image_to_string(path, lang="ces").strip()
    except pytesseract.TesseractNotFoundError:
        # Reported separately so a missing installation is not mistaken for a bad image.
        print("Error: Tesseract executable not found on your system.")
        return ""
    except Exception as e:
        # Best-effort boundary: one unreadable image must not abort the whole run.
        print(f"Error processing {path}: {e}")
        return ""
def classify(text: str) -> Bet:
"""Given text extracted from an image and a date, return a Bet object that is
relevant to that text."""
def _parse_block(lines: list[str]) -> Bet:
    """Parse a single block of OCR text representing exactly one bet.

    The date is read from the first line (``D. M. YYYY``), the team names from
    the second line (``Team A - Team B``), and the league from the first line
    that contains a slash together with a known sport name. Every line is then
    checked against the supported bet formats, in order; the first line that
    matches decides the result.

    Args:
        lines: Text lines belonging to one bet.

    Returns:
        The recognised bet, or an ``UnknownBet`` carrying the raw block text
        when no supported format matches. Missing pieces default to
        ``"Unknown"`` (teams, league) and the current time (date).
    """
    logger = logging.getLogger(__name__)

    team1, team2 = "Unknown", "Unknown"
    league = "Unknown"
    date_obj = datetime.datetime.now()
    raw_text = "\n".join(lines)

    # 1. Date extraction
    if lines:
        date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
        if date_m:
            day, month, year = (int(part) for part in date_m.groups())
            try:
                date_obj = datetime.datetime(year, month, day)
            except ValueError:
                # OCR can yield impossible dates (e.g. 32. 13. 2026); keep the default.
                logger.debug("Ignoring invalid date %r", date_m.group(0))

    # 2. Teams extraction (usually the line after the date)
    if len(lines) > 1:
        # OCR may emit en/em dashes or a minus sign instead of a plain hyphen.
        ln_norm = re.sub(r"[–—−]", "-", lines[1])
        m = re.match(r"^(.+?)\s*-\s*(.+)$", ln_norm)
        if m:
            team1, team2 = m.group(1).strip(), m.group(2).strip()

    # 3. League extraction (typically contains a slash and sport name)
    for ln in lines:
        if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]):
            league = ln.strip()
            break

    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}

    # Double-chance picks after "x" -> "0", mapped to the canonical spellings.
    double_picks = {"10": "01", "01": "01", "20": "02", "02": "02", "12": "12", "21": "12"}

    # 4. Bet Type Classification (Strict parsing per tickety.md)
    for ln in lines:
        lower_line = ln.lower()

        # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
        m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
        if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
            pick = m_vysl.group(1).upper()
            if pick == "X":
                pick = "0"
            return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=pick, **base_args)

        # Výsledek zápasu - dvojtip (01, 02, 12, etc.)
        # "21" is accepted too: the normalisation always handled it, but the
        # pattern never let it through.
        m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|21|1x|x1|x2|2x)$", lower_line)
        if m_dvoj:
            # The line is already lower-cased, so only "x" can occur here.
            pick = double_picks.get(m_dvoj.group(1).replace("x", "0"))
            if pick is not None:
                return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)

        # Výsledek zápasu bez remízy (Sázka bez remízy)
        m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
        if m_bez:
            return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)

        # Každý z týmů dá gól v zápasu
        m_btts = re.search(r"každý z týmů dá gól.*?:\s*(ano|ne)$", lower_line)
        if m_btts:
            if m_btts.group(1) == "ano":
                return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
            # `ticket.py` does not currently define resolution for "Ne" logic,
            # so stop scanning and report the block as unknown.
            break

        # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
        m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
        if m_goals and "tým" not in lower_line:  # Avoids team-specific goals unless implemented
            try:
                val = float(m_goals.group(2))
            except ValueError:
                # "[\d.]+" also matches OCR noise such as "2..5"; treat it as no match.
                logger.debug("Unparseable goal line in %r", ln)
            else:
                return GoalAmount(
                    ticketType=BetType.GOAL_AMOUNT, line=val, over=m_goals.group(1) == "+", **base_args
                )

        # Kdo postoupí
        if "postoupí" in lower_line or "postup" in lower_line:
            return Advance(ticketType=BetType.ADVANCED, **base_args)

        # Handicap v zápasu
        m_hcp = re.search(r"handicap\s*(1|2)\s*:?\s*([+-]?[\d.]+)$", lower_line)
        if m_hcp:
            try:
                val = float(m_hcp.group(2))
            except ValueError:
                logger.debug("Unparseable handicap value in %r", ln)
            else:
                return GoalHandicap(
                    ticketType=BetType.GOAL_HANDICAP,
                    team_bet=m_hcp.group(1),
                    handicap_amount=val,
                    **base_args,
                )

    # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
def classify(text: str) -> list[Bet]:
"""Return a list of Bet objects parsed from OCR `text`."""
text = (text or "").strip()
if not text:
return UnknownBet(
return [
UnknownBet(
ticketType=BetType.UNKNOWN,
team1Name="N/A",
team2Name="N/A",
@@ -49,146 +136,78 @@ def classify(text: str) -> Bet:
league="N/A",
raw_text="No text extracted",
)
]
# 1. Defaults & Normalization
text_lower = text.lower()
date_obj = datetime.datetime.now()
team1, team2 = "Unknown", "Unknown"
league = "Unknown"
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
bets: list[Bet] = []
# 2. Heuristic extraction of Teams (Looking for "Team A - Team B" patterns)
lines = [line.strip() for line in text.split("\n") if line.strip()]
for line in lines:
if " - " in line or " vs " in line or " v " in line:
# Avoid splitting on hyphens in dates or numbers
if not re.search(r"\d\s*-\s*\d", line):
parts = re.split(r" - | vs | v ", line)
if len(parts) >= 2:
team1, team2 = parts[0].strip(), parts[1].strip()
break
blocks = []
current_block = []
# 3. Heuristic extraction of Date (Looking for DD.MM. YYYY HH:MM)
date_match = re.search(r"(\d{1,2}\.\s*\d{1,2}\.?\s*(?:\d{2,4})?)\s*(\d{1,2}:\d{2})?", text)
if date_match:
try:
# Fallback to current year if missing, basic parse attempt
date_str = f"{date_match.group(1).replace(' ', '')} {date_match.group(2) or '00:00'}"
if len(date_str.split(".")[2]) <= 5: # Missing year
date_str = date_str.replace(" ", f"{datetime.datetime.now().year} ")
date_obj = datetime.datetime.strptime(date_str, "%d.%m.%Y %H:%M")
except Exception:
pass # Keep default if parsing fails
# Sports that Fortuna commonly uses as block terminators
# We look for the line that contains e.g., "Fotbal / 1. Španělsko"
sport_delimiter_pattern = re.compile(
r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
)
# 4. Classification Logic based on keywords
base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
for ln in lines:
current_block.append(ln)
# Advance / Postup
if any(kw in text_lower for kw in ["postup", "postoupí", "advance"]):
return Advance(ticketType=BetType.ADVANCED, **base_args)
# If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
if sport_delimiter_pattern.match(ln):
blocks.append(current_block)
current_block = []
# Both Teams to Score / Oba dají gól
if any(kw in text_lower for kw in ["oba dají gól", "btts", "oba týmy dají gól"]):
return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
# Catch any dangling lines at the end that didn't have a clean sport delimiter
if current_block and len(current_block) > 1:
blocks.append(current_block)
# Goal Amount (Over/Under)
if any(kw in text_lower for kw in ["počet gólů", "více než", "méně než", "over", "under"]):
# Attempt to find the goal line (e.g., 2.5, 3.5)
line_match = re.search(r"(\d+\.\d+)", text)
line_val = float(line_match.group(1)) if line_match else 2.5
is_over = any(kw in text_lower for kw in ["více", "over", "+"])
# Parse each block into a separate Bet object
for block in blocks:
print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
bets.append(_parse_block(block))
return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=line_val, over=is_over, **base_args)
# Goal Handicap
if any(kw in text_lower for kw in ["handicap", "hcp"]):
hcp_match = re.search(r"([+-]?\d+\.\d+)", text)
hcp_val = float(hcp_match.group(1)) if hcp_match else 0.0
# Simplistic logic: guess team 1 if not explicitly stated
team_bet = "2" if " 2 " in text else "1"
return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=hcp_val, **base_args)
# Win Draw Lose Double (1X, X2, 12)
if any(kw in text_lower for kw in ["1x", "x2", "12", "dvojitá šance", "neprohra"]):
bet_type = "01" if "1x" in text_lower else "02" if "x2" in text_lower else "12"
return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=bet_type, **base_args)
# Win Lose (Draw no bet / Vítěz do rozhodnutí)
if any(kw in text_lower for kw in ["bez remízy", "vítěz do rozhodnutí", "konečný vítěz"]):
bet_type = "2" if re.search(r"\b2\b", text) else "1"
return WinLose(ticketType=BetType.WIN_LOSE, betType=bet_type, **base_args)
# Win Draw Lose (Standard Match Odds)
if any(kw in text_lower for kw in ["zápas", "výsledek zápasu", "1x2"]):
# Look for isolated 1, X (or 0), or 2
match_pick = re.search(r"\b(1|x|0|2)\b", text_lower)
bet_type = match_pick.group(1).upper() if match_pick else "1"
if bet_type == "X":
bet_type = "0"
return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=bet_type, **base_args)
# Fallback Unknown
return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=text, **base_args)
return bets
def img_classify(paths: list[str], ticket_id: int) -> Ticket:
"""Given a path to an image and a date, return a list of Tickets that are
relevant to that image and date."""
# Define valid image extensions to ignore system files or text documents
"""Given a list of file paths to images, classify each and collect bets into a Ticket."""
ticket = Ticket(id=ticket_id, bets=[])
valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
# Iterate through all files in the folder
for file in paths:
file_path = Path(file)
if file_path.is_file() and file_path.suffix.lower() in valid_extensions:
# 1. Extract the text (called separately)
if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
print(f"Skipping invalid file: {file}")
continue
extracted_text = img_to_text(str(file_path))
print(f"--- Extracted Text from {file_path.name} ---")
print(extracted_text)
# 2. Classify based on the extracted text (called separately)
try:
result = classify(extracted_text)
except Exception as exc: # pragma: no cover - defensive fallback
# Ensure result is always defined so downstream code cannot reference an unbound name
print(f"classify() raised an exception: {exc}")
result = UnknownBet(
except Exception as exc:
print(f"classify() error for {file_path}: {exc}")
result = [
UnknownBet(
ticketType=BetType.UNKNOWN,
team1Name="N/A",
team2Name="N/A",
date=datetime.datetime.now(),
league="N/A",
raw_text=extracted_text,
date=datetime.datetime.now(),
)
]
# 3. Add the resulting tickets to our main list
# Support classifier returning either a single Bet or a list of Bet
if result is None:
continue
for bet in result:
print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
if isinstance(result, list):
for r in result:
print(
r.date,
getattr(r, "ticketType", None),
r.team1Name,
r.team2Name,
r.league,
)
ticket.bets.extend(result)
else:
print(
result.date,
getattr(result, "ticketType", None),
result.team1Name,
result.team2Name,
result.league,
)
ticket.bets.append(result)
return ticket
if __name__ == "__main__":
img_classify(["./data/screenshots/2.png"], ticket_id=1)
# Test script runner
img_classify(["./data/screenshots/2.png"], ticket_id=2)