Classifier work
This commit is contained in:
@@ -1,9 +1,8 @@
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from pytesseract import pytesseract
|
||||
import pytesseract
|
||||
|
||||
from beaky.datamodels.ticket import (
|
||||
Advance,
|
||||
@@ -21,27 +20,119 @@ from beaky.datamodels.ticket import (
|
||||
|
||||
|
||||
def img_to_text(path: str) -> str:
|
||||
"""Given a path to an image, return the text contained in that image.
|
||||
Bypasses PIL and lets Tesseract read the file directly.
|
||||
"""
|
||||
"""Read text from image using tesseract; returns empty string on error."""
|
||||
try:
|
||||
text = pytesseract.image_to_string(path, lang="ces")
|
||||
return text.strip()
|
||||
except pytesseract.TesseractNotFoundError:
|
||||
print("Error: Tesseract executable not found on your system.")
|
||||
return ""
|
||||
return pytesseract.image_to_string(path, lang="ces").strip()
|
||||
except Exception as e:
|
||||
print(f"Error processing {path}: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
def classify(text: str) -> Bet:
|
||||
"""Given text extracted from an image and a date, return a Bet object that is
|
||||
relevant to that text."""
|
||||
def _parse_block(lines: list[str]) -> Bet:
    """Parse the OCR lines of exactly one bet into a ``Bet`` object.

    The first line is searched for a ``D. M. YYYY`` date, the second line for
    a ``Team A - Team B`` pairing, and every line for a league marker and for
    one of the known bet-type phrases. Lines are scanned in order and the
    first bet-type pattern that matches decides the result.

    Args:
        lines: Text lines belonging to a single bet.

    Returns:
        The matching bet object, or ``UnknownBet`` (carrying the joined block
        text) when no pattern matches or the bet variant is unsupported.
    """
    # Defaults used whenever a field cannot be recovered from the OCR text.
    team1, team2 = "Unknown", "Unknown"
    league = "Unknown"
    date_obj = datetime.datetime.now()
    raw_text = "\n".join(lines)

    # 1. Date extraction (day, month, year on the first line).
    if lines:
        date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
        if date_m:
            day, month, year = (int(part) for part in date_m.groups())
            try:
                date_obj = datetime.datetime(year, month, day)
            except ValueError:
                # OCR produced an impossible date (e.g. 31. 2.); keep the default.
                pass

    # 2. Teams extraction (usually the line after the date).
    if len(lines) > 1:
        # Normalise en/em dashes and the minus sign to a plain hyphen first.
        ln_norm = re.sub(r"[–—−]", "-", lines[1])
        m = re.match(r"^(.+?)\s*-\s*(.+)$", ln_norm)
        if m:
            team1, team2 = m.group(1).strip(), m.group(2).strip()

    # 3. League extraction (a line containing a slash and a sport name).
    sports = ("Fotbal", "Hokej", "Tenis")
    for ln in lines:
        if "/" in ln and any(sport in ln for sport in sports):
            league = ln.strip()
            break

    base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}

    # A decimal number with "." or "," as separator. Being strict here means
    # float() below can never raise on OCR noise such as "2.5." or ".".
    number = r"\d+(?:[.,]\d+)?"

    # 4. Bet type classification: first matching line wins.
    for ln in lines:
        lower_line = ln.lower()

        # Match result (1X2) - end-anchored so "Výsledek 1. poločasu" is not caught.
        m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
        if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
            pick = m_vysl.group(1)
            # A draw may be printed as "X"; the bet model is given "0".
            return WinDrawLose(
                ticketType=BetType.WIN_DRAW_LOSE,
                betType="0" if pick == "x" else pick,
                **base_args,
            )

        # Match result - double chance (01, 02, 12 in either order, X for draw).
        m_dvoj = re.search(
            r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|21|1x|x1|x2|2x)$",
            lower_line,
        )
        if m_dvoj:
            # Map X -> 0 and sort the two symbols so every spelling collapses
            # onto one of "01", "02" or "12".
            pick = "".join(sorted(m_dvoj.group(1).replace("x", "0")))
            return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)

        # Match result without draw (draw no bet).
        m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
        if m_bez:
            return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)

        # Both teams to score.
        m_btts = re.search(r"každý z týmů dá gól.*?:\s*(ano|ne)$", lower_line)
        if m_btts:
            if m_btts.group(1) == "ano":
                return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
            # The "ne" variant has no bet model; stop scanning so the block is
            # reported as UnknownBet instead of being matched by a later rule.
            break

        # Total goals in the match (e.g. "Počet gólů v zápasu 2.5: + 2.5").
        m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*(" + number + r")", lower_line)
        if m_goals and "tým" not in lower_line:  # skip team-specific goal totals
            goal_line = float(m_goals.group(2).replace(",", "."))
            return GoalAmount(
                ticketType=BetType.GOAL_AMOUNT,
                line=goal_line,
                over=m_goals.group(1) == "+",
                **base_args,
            )

        # Who advances.
        if "postoupí" in lower_line or "postup" in lower_line:
            return Advance(ticketType=BetType.ADVANCED, **base_args)

        # Match handicap.
        m_hcp = re.search(r"handicap\s*(1|2)\s*:?\s*([+-]?" + number + r")$", lower_line)
        if m_hcp:
            return GoalHandicap(
                ticketType=BetType.GOAL_HANDICAP,
                team_bet=m_hcp.group(1),
                handicap_amount=float(m_hcp.group(2).replace(",", ".")),
                **base_args,
            )

    # Fallback for unknown formats (e.g. "Výsledek 1. poločasu") or parsing failures.
    return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
|
||||
|
||||
|
||||
def classify(text: str) -> list[Bet]:
|
||||
"""Return a list of Bet objects parsed from OCR `text`.
|
||||
|
||||
Separates the text into chunks per bet based on date/time markers, then
|
||||
parses each chunk individually.
|
||||
"""
|
||||
text = (text or "").strip()
|
||||
if not text:
|
||||
return UnknownBet(
|
||||
return [
|
||||
UnknownBet(
|
||||
ticketType=BetType.UNKNOWN,
|
||||
team1Name="N/A",
|
||||
team2Name="N/A",
|
||||
@@ -49,146 +140,75 @@ def classify(text: str) -> Bet:
|
||||
league="N/A",
|
||||
raw_text="No text extracted",
|
||||
)
|
||||
]
|
||||
|
||||
# 1. Defaults & Normalization
|
||||
text_lower = text.lower()
|
||||
date_obj = datetime.datetime.now()
|
||||
team1, team2 = "Unknown", "Unknown"
|
||||
league = "Unknown"
|
||||
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
||||
bets: list[Bet] = []
|
||||
|
||||
# 2. Heuristic extraction of Teams (Looking for "Team A - Team B" patterns)
|
||||
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
||||
for line in lines:
|
||||
if " - " in line or " vs " in line or " v " in line:
|
||||
# Avoid splitting on hyphens in dates or numbers
|
||||
if not re.search(r"\d\s*-\s*\d", line):
|
||||
parts = re.split(r" - | vs | v ", line)
|
||||
if len(parts) >= 2:
|
||||
team1, team2 = parts[0].strip(), parts[1].strip()
|
||||
break
|
||||
# Chunk the OCR output by looking for Date/Time markers
|
||||
# (e.g. "pá 6. 3. 2026 21:00", "dnes 18:30")
|
||||
blocks = []
|
||||
current_block = []
|
||||
date_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.\s*\d{4}|\bdnes\b|\bzítra\b).*?\d{2}:\d{2}", re.IGNORECASE)
|
||||
|
||||
# 3. Heuristic extraction of Date (Looking for DD.MM. YYYY HH:MM)
|
||||
date_match = re.search(r"(\d{1,2}\.\s*\d{1,2}\.?\s*(?:\d{2,4})?)\s*(\d{1,2}:\d{2})?", text)
|
||||
if date_match:
|
||||
try:
|
||||
# Fallback to current year if missing, basic parse attempt
|
||||
date_str = f"{date_match.group(1).replace(' ', '')} {date_match.group(2) or '00:00'}"
|
||||
if len(date_str.split(".")[2]) <= 5: # Missing year
|
||||
date_str = date_str.replace(" ", f"{datetime.datetime.now().year} ")
|
||||
date_obj = datetime.datetime.strptime(date_str, "%d.%m.%Y %H:%M")
|
||||
except Exception:
|
||||
pass # Keep default if parsing fails
|
||||
for ln in lines:
|
||||
if date_pattern.search(ln):
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
current_block = [ln]
|
||||
else:
|
||||
current_block.append(ln)
|
||||
|
||||
# 4. Classification Logic based on keywords
|
||||
base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
|
||||
# Advance / Postup
|
||||
if any(kw in text_lower for kw in ["postup", "postoupí", "advance"]):
|
||||
return Advance(ticketType=BetType.ADVANCED, **base_args)
|
||||
if not blocks:
|
||||
blocks = [lines] # Fallback if regex missed the date boundaries
|
||||
|
||||
# Both Teams to Score / Oba dají gól
|
||||
if any(kw in text_lower for kw in ["oba dají gól", "btts", "oba týmy dají gól"]):
|
||||
return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
|
||||
# Parse each block into a separate Bet object
|
||||
for block in blocks:
|
||||
bets.append(_parse_block(block))
|
||||
|
||||
# Goal Amount (Over/Under)
|
||||
if any(kw in text_lower for kw in ["počet gólů", "více než", "méně než", "over", "under"]):
|
||||
# Attempt to find the goal line (e.g., 2.5, 3.5)
|
||||
line_match = re.search(r"(\d+\.\d+)", text)
|
||||
line_val = float(line_match.group(1)) if line_match else 2.5
|
||||
is_over = any(kw in text_lower for kw in ["více", "over", "+"])
|
||||
|
||||
return GoalAmount(ticketType=BetType.GOAL_AMOUNT, line=line_val, over=is_over, **base_args)
|
||||
|
||||
# Goal Handicap
|
||||
if any(kw in text_lower for kw in ["handicap", "hcp"]):
|
||||
hcp_match = re.search(r"([+-]?\d+\.\d+)", text)
|
||||
hcp_val = float(hcp_match.group(1)) if hcp_match else 0.0
|
||||
# Simplistic logic: guess team 1 if not explicitly stated
|
||||
team_bet = "2" if " 2 " in text else "1"
|
||||
|
||||
return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=hcp_val, **base_args)
|
||||
|
||||
# Win Draw Lose Double (1X, X2, 12)
|
||||
if any(kw in text_lower for kw in ["1x", "x2", "12", "dvojitá šance", "neprohra"]):
|
||||
bet_type = "01" if "1x" in text_lower else "02" if "x2" in text_lower else "12"
|
||||
return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=bet_type, **base_args)
|
||||
|
||||
# Win Lose (Draw no bet / Vítěz do rozhodnutí)
|
||||
if any(kw in text_lower for kw in ["bez remízy", "vítěz do rozhodnutí", "konečný vítěz"]):
|
||||
bet_type = "2" if re.search(r"\b2\b", text) else "1"
|
||||
return WinLose(ticketType=BetType.WIN_LOSE, betType=bet_type, **base_args)
|
||||
|
||||
# Win Draw Lose (Standard Match Odds)
|
||||
if any(kw in text_lower for kw in ["zápas", "výsledek zápasu", "1x2"]):
|
||||
# Look for isolated 1, X (or 0), or 2
|
||||
match_pick = re.search(r"\b(1|x|0|2)\b", text_lower)
|
||||
bet_type = match_pick.group(1).upper() if match_pick else "1"
|
||||
if bet_type == "X":
|
||||
bet_type = "0"
|
||||
|
||||
return WinDrawLose(ticketType=BetType.WIN_DRAW_LOSE, betType=bet_type, **base_args)
|
||||
|
||||
# Fallback Unknown
|
||||
return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=text, **base_args)
|
||||
return bets
|
||||
|
||||
|
||||
def img_classify(paths: list[str], ticket_id: int) -> Ticket:
|
||||
"""Given a path to an image and a date, return a list of Tickets that are
|
||||
relevant to that image and date."""
|
||||
# Define valid image extensions to ignore system files or text documents
|
||||
"""Given a list of file paths to images, classify each and collect bets into a Ticket."""
|
||||
ticket = Ticket(id=ticket_id, bets=[])
|
||||
valid_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
|
||||
|
||||
# Iterate through all files in the folder
|
||||
for file in paths:
|
||||
file_path = Path(file)
|
||||
if file_path.is_file() and file_path.suffix.lower() in valid_extensions:
|
||||
# 1. Extract the text (called separately)
|
||||
if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
|
||||
continue
|
||||
|
||||
extracted_text = img_to_text(str(file_path))
|
||||
print(f"--- Extracted Text from {file_path.name} ---")
|
||||
print(extracted_text)
|
||||
|
||||
# 2. Classify based on the extracted text (called separately)
|
||||
try:
|
||||
result = classify(extracted_text)
|
||||
except Exception as exc: # pragma: no cover - defensive fallback
|
||||
# Ensure result is always defined so downstream code cannot reference an unbound name
|
||||
print(f"classify() raised an exception: {exc}")
|
||||
result = UnknownBet(
|
||||
except Exception as exc:
|
||||
print(f"classify() error for {file_path}: {exc}")
|
||||
result = [
|
||||
UnknownBet(
|
||||
ticketType=BetType.UNKNOWN,
|
||||
team1Name="N/A",
|
||||
team2Name="N/A",
|
||||
date=datetime.datetime.now(),
|
||||
league="N/A",
|
||||
raw_text=extracted_text,
|
||||
date=datetime.datetime.now(),
|
||||
)
|
||||
]
|
||||
|
||||
# 3. Add the resulting tickets to our main list
|
||||
# Support classifier returning either a single Bet or a list of Bet
|
||||
if result is None:
|
||||
continue
|
||||
for bet in result:
|
||||
print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
|
||||
|
||||
if isinstance(result, list):
|
||||
for r in result:
|
||||
print(
|
||||
r.date,
|
||||
getattr(r, "ticketType", None),
|
||||
r.team1Name,
|
||||
r.team2Name,
|
||||
r.league,
|
||||
)
|
||||
ticket.bets.extend(result)
|
||||
else:
|
||||
print(
|
||||
result.date,
|
||||
getattr(result, "ticketType", None),
|
||||
result.team1Name,
|
||||
result.team2Name,
|
||||
result.league,
|
||||
)
|
||||
ticket.bets.append(result)
|
||||
|
||||
return ticket
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
img_classify(["./data/screenshots/2.png"], ticket_id=1)
|
||||
# Test script runner
|
||||
img_classify(["./data/screenshots/2.png"], ticket_id=2)
|
||||
|
||||
Reference in New Issue
Block a user