Image classifier debugged line separation and block separation

This commit is contained in:
Chlupaty
2026-03-22 15:58:23 +01:00
parent 8edd016e39
commit 60bbc09ccc

View File

@@ -37,6 +37,7 @@ def _parse_block(lines: list[str]) -> Bet:
# 1. Date extraction
if lines:
# Regex is forgiving of letters attached to numbers due to OCR (e.g., s07.3.2026)
date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
if date_m:
try:
@@ -53,17 +54,17 @@ def _parse_block(lines: list[str]) -> Bet:
# 3. League extraction (typically contains a slash and sport name)
for ln in lines:
if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]):
if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis", "Basketbal"]):
league = ln.strip()
break
base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
# 4. Bet Type Classification (Strict parsing per tickety.md)
# 4. Bet Type Classification
for ln in lines:
lower_line = ln.lower()
# Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
# Výsledek zápasu (1X2)
m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
pick = m_vysl.group(1).upper()
@@ -75,7 +76,6 @@ def _parse_block(lines: list[str]) -> Bet:
m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
if m_dvoj:
pick = m_dvoj.group(1).replace("x", "0").replace("X", "0")
# Normalize combinations to match literal definitions in ticket.py
if pick in ["10", "01"]:
pick = "01"
elif pick in ["20", "02"]:
@@ -86,7 +86,7 @@ def _parse_block(lines: list[str]) -> Bet:
if pick in ["01", "12", "02"]:
return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)
# Výsledek zápasu bez remízy (Sázka bez remízy)
# Výsledek zápasu bez remízy
m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
if m_bez:
return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)
@@ -97,12 +97,11 @@ def _parse_block(lines: list[str]) -> Bet:
if m_btts.group(1) == "ano":
return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
else:
# `ticket.py` does not currently define resolution for "Ne" logic
break
# Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
# Počet gólů v zápasu
m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
if m_goals and "tým" not in lower_line: # Avoids team-specific goals unless implemented
if m_goals and "tým" not in lower_line:
sign = m_goals.group(1)
val = float(m_goals.group(2))
is_over = sign == "+"
@@ -119,7 +118,7 @@ def _parse_block(lines: list[str]) -> Bet:
val = float(m_hcp.group(2))
return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args)
# Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
# Fallback
return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
@@ -143,28 +142,51 @@ def classify(text: str) -> list[Bet]:
blocks = []
current_block = []
in_block = False
# Sports that Fortuna commonly uses as block terminators
# We look for the line that contains e.g., "Fotbal / 1. Španělsko"
sport_delimiter_pattern = re.compile(
r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
)
# START trigger: looks for 'dnes', 'zítra', 'včera' (with or without diacritics), or a 'DD.MM.' date
date_start_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.|\b(dnes|zítra|zitra|včera|vcera)\b)", re.IGNORECASE)
# END trigger: Looks for standard Fortuna sport prefixes
sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE)
for ln in lines:
current_block.append(ln)
is_start = date_start_pattern.search(ln)
is_end = sport_end_pattern.match(ln)
# If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
if sport_delimiter_pattern.match(ln):
if is_start:
# If we somehow hit a start while already in a block (missing end marker fallback),
# save the current block before starting a new one.
if current_block:
print(f"ERROR: SUS block was not properly ended and new block start detected: '{ln}'")
blocks.append(current_block)
current_block = [ln]
in_block = True
elif is_end:
# We hit the league/sport line. Add it, save the block, and close the window.
current_block.append(ln)
blocks.append(current_block)
current_block = []
in_block = False
# Catch any dangling lines at the end that didn't have a clean sport delimiter
if current_block and len(current_block) > 1:
elif in_block:
# We are inside a block, gathering standard match info (teams, bet types).
current_block.append(ln)
else:
# We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170").
# The line is not added to any block; it is only reported via an INFO message.
print(f"INFO: Ignoring line outside of any block: '{ln}'")
pass
# Catch any dangling block at the very end of the document
if current_block:
blocks.append(current_block)
# Parse each block into a separate Bet object
for block in blocks:
print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
if len(block) > 1: # Ensure the block has enough lines to be valid
bets.append(_parse_block(block))
return bets
@@ -183,9 +205,10 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
extracted_text = img_to_text(str(file_path))
print(f"--- Extracted Text from {file_path.name} ---")
print(extracted_text)
# print(extracted_text)
try:
print("--- START OF Classification Result ---")
result = classify(extracted_text)
except Exception as exc:
print(f"classify() error for {file_path}: {exc}")
@@ -200,8 +223,8 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
)
]
for bet in result:
print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
# for bet in result:
# print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
ticket.bets.extend(result)
@@ -210,4 +233,4 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
if __name__ == "__main__":
# Test script runner
img_classify(["./data/screenshots/2.png"], ticket_id=2)
img_classify(["./data/screenshots/26.png", "./data/screenshots/27.png"], ticket_id=2)