Compare commits

..

2 Commits

Author SHA1 Message Date
Chlupaty
fd3a8f4955 Image classifier debugging block separation and line separation 2026-03-22 14:39:35 +01:00
Chlupaty
346318177d Classifier work 2026-03-22 14:39:35 +01:00

View File

@@ -1,9 +1,8 @@
import datetime import datetime
import logging
import re import re
from pathlib import Path from pathlib import Path
from pytesseract import pytesseract import pytesseract
from beaky.datamodels.ticket import ( from beaky.datamodels.ticket import (
Advance, Advance,
@@ -21,174 +20,194 @@ from beaky.datamodels.ticket import (
def img_to_text(path: str) -> str:
    """Return the text Tesseract reads from the image at ``path``.

    The path is passed directly to ``pytesseract.image_to_string`` with the
    Czech language data (``lang="ces"``); leading and trailing whitespace is
    stripped from the result.

    Args:
        path: Filesystem path of the image to run OCR on.

    Returns:
        The stripped OCR text, or an empty string if any error occurs.
    """
    try:
        return pytesseract.image_to_string(path, lang="ces").strip()
    except Exception as e:
        # Deliberate best-effort boundary: a failing OCR call is reported and
        # turned into empty text so the caller can keep processing other files.
        print(f"Error processing {path}: {e}")
        return ""
# Lines such as "Fotbal / 1. Španělsko" close a bet block; the same pattern is
# reused to recognise the league line inside a block.
_SPORT_DELIMITER_RE = re.compile(
    r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
)


def _safe_float(raw: str) -> "float | None":
    """Convert ``raw`` to float, returning None instead of raising on OCR noise."""
    try:
        return float(raw)
    except ValueError:
        return None


def _parse_block(lines: list[str]) -> Bet:
    """Parse one block of OCR lines that represents exactly one bet.

    Args:
        lines: Stripped, non-empty text lines of a single bet block. The first
            line is searched for a ``D.M.YYYY`` date and the second line for a
            ``Team A - Team B`` pair.

    Returns:
        The first specific bet type whose pattern matches a line of the block,
        or an ``UnknownBet`` carrying the block's raw text when nothing matches.
    """
    team1, team2 = "Unknown", "Unknown"
    league = "Unknown"
    date_obj = datetime.datetime.now()
    raw_text = "\n".join(lines)

    # 1. Date extraction
    if lines:
        date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
        if date_m:
            try:
                date_obj = datetime.datetime(int(date_m.group(3)), int(date_m.group(2)), int(date_m.group(1)))
            except ValueError:
                # Digits matched but do not form a real calendar date
                # (e.g. misread "32. 13. 2026"); keep the default.
                pass

    # 2. Teams extraction (usually the line after the date)
    if len(lines) > 1:
        # Normalise en/em dashes and the minus sign to a plain hyphen first.
        ln_norm = re.sub(r"[–—−]", "-", lines[1])
        m = re.match(r"^(.+?)\s*-\s*(.+)$", ln_norm)
        if m:
            team1, team2 = m.group(1).strip(), m.group(2).strip()

    # 3. League extraction (typically contains a slash and sport name).
    # The second test keeps this in step with the block splitter, so sports
    # other than Fotbal/Hokej/Tenis also get their league recorded.
    for ln in lines:
        if ("/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"])) or _SPORT_DELIMITER_RE.match(ln):
            league = ln.strip()
            break

    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}

    # 4. Bet Type Classification (Strict parsing per tickety.md)
    for ln in lines:
        lower_line = ln.lower()

        # Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
        m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
        if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
            pick = m_vysl.group(1).upper()
            if pick == "X":
                pick = "0"
            return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=pick, **base_args)

        # Výsledek zápasu - dvojtip (01, 02, 12, etc.)
        m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
        if m_dvoj:
            # "x" stands for the draw ("0"); sorting the two digits yields the
            # canonical "01" / "02" / "12" form regardless of the printed order.
            pick = "".join(sorted(m_dvoj.group(1).replace("x", "0")))
            return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)

        # Výsledek zápasu bez remízy (Sázka bez remízy)
        m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
        if m_bez:
            return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)

        # Každý z týmů dá gól v zápasu
        m_btts = re.search(r"každý z týmů dá gól.*?:\s*(ano|ne)$", lower_line)
        if m_btts:
            if m_btts.group(1) == "ano":
                return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
            # `ticket.py` does not currently define resolution for "Ne" logic,
            # so stop scanning and fall through to UnknownBet.
            break

        # Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
        m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
        if m_goals and "tým" not in lower_line:  # Avoids team-specific goals unless implemented
            val = _safe_float(m_goals.group(2))
            if val is not None:  # unparseable number (OCR noise): keep scanning
                return GoalAmount(
                    ticketType=BetType.GOAL_AMOUNT, line=val, over=m_goals.group(1) == "+", **base_args
                )

        # Kdo postoupí
        if "postoupí" in lower_line or "postup" in lower_line:
            return Advance(ticketType=BetType.ADVANCED, **base_args)

        # Handicap v zápasu
        m_hcp = re.search(r"handicap\s*(1|2)\s*:?\s*([+-]?[\d.]+)$", lower_line)
        if m_hcp:
            val = _safe_float(m_hcp.group(2))
            if val is not None:  # unparseable number (OCR noise): keep scanning
                return GoalHandicap(
                    ticketType=BetType.GOAL_HANDICAP, team_bet=m_hcp.group(1), handicap_amount=val, **base_args
                )

    # Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)


def classify(text: str) -> list[Bet]:
    """Return a list of Bet objects parsed from OCR ``text``.

    The text is split into blocks, each ending with a line that starts with a
    sport name followed by a slash (e.g. "Fotbal / ..."), and every block is
    parsed into one bet.

    Args:
        text: Raw OCR output; ``None`` or blank text is accepted.

    Returns:
        One bet per detected block. Blank input yields a single ``UnknownBet``
        whose ``raw_text`` is "No text extracted".
    """
    text = (text or "").strip()
    if not text:
        return [
            UnknownBet(
                ticketType=BetType.UNKNOWN,
                team1Name="N/A",
                team2Name="N/A",
                date=datetime.datetime.now(),
                league="N/A",
                raw_text="No text extracted",
            )
        ]

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    blocks: list[list[str]] = []
    current_block: list[str] = []
    for ln in lines:
        current_block.append(ln)
        # A sport line is the LAST line of a bet block, so it closes the block.
        if _SPORT_DELIMITER_RE.match(ln):
            blocks.append(current_block)
            current_block = []

    # Catch dangling lines at the end that did not have a clean sport delimiter;
    # a single leftover line is dropped.
    # NOTE(review): non-empty text that yields no block at all therefore returns
    # an empty list - confirm callers are fine with an image contributing no bet.
    if len(current_block) > 1:
        blocks.append(current_block)

    # Parse each block into a separate Bet object
    bets: list[Bet] = []
    for block in blocks:
        print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
        bets.append(_parse_block(block))

    return bets
def img_classify(paths: list[str], ticket_id: int) -> Ticket:
    """Classify every image in ``paths`` and collect the bets into one Ticket.

    Args:
        paths: File paths of the images to process. Entries that are not
            existing files, or whose suffix is not a supported image
            extension, are reported and skipped.
        ticket_id: Identifier assigned to the returned ticket.

    Returns:
        A ``Ticket`` with ``id=ticket_id`` whose ``bets`` holds the bets
        parsed from all accepted images, in input order.
    """
    ticket = Ticket(id=ticket_id, bets=[])
    valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}

    for file in paths:
        file_path = Path(file)
        if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
            print(f"Skipping invalid file: {file}")
            continue

        extracted_text = img_to_text(str(file_path))
        print(f"--- Extracted Text from {file_path.name} ---")
        print(extracted_text)

        try:
            result = classify(extracted_text)
        except Exception as exc:
            # Defensive fallback: a parser failure must not lose the image, so
            # it is recorded as a single unknown bet carrying the raw text.
            print(f"classify() error for {file_path}: {exc}")
            result = [
                UnknownBet(
                    ticketType=BetType.UNKNOWN,
                    team1Name="N/A",
                    team2Name="N/A",
                    league="N/A",
                    raw_text=extracted_text,
                    date=datetime.datetime.now(),
                )
            ]

        for bet in result:
            print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")

        ticket.bets.extend(result)

    return ticket
if __name__ == "__main__":
    # Test script runner: classify one sample screenshot when the module is
    # executed directly.
    img_classify(["./data/screenshots/2.png"], ticket_id=2)