Image classifier debugged line separation and block separation
This commit is contained in:
@@ -37,6 +37,7 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
|
|
||||||
# 1. Date extraction
|
# 1. Date extraction
|
||||||
if lines:
|
if lines:
|
||||||
|
# Regex is forgiving of letters attached to numbers due to OCR (e.g., s07.3.2026)
|
||||||
date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
|
date_m = re.search(r"(\d{1,2})\.\s*(\d{1,2})\.\s*(\d{4})", lines[0])
|
||||||
if date_m:
|
if date_m:
|
||||||
try:
|
try:
|
||||||
@@ -53,17 +54,17 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
|
|
||||||
# 3. League extraction (typically contains a slash and sport name)
|
# 3. League extraction (typically contains a slash and sport name)
|
||||||
for ln in lines:
|
for ln in lines:
|
||||||
if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis"]):
|
if "/" in ln and any(sport in ln for sport in ["Fotbal", "Hokej", "Tenis", "Basketbal"]):
|
||||||
league = ln.strip()
|
league = ln.strip()
|
||||||
break
|
break
|
||||||
|
|
||||||
base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
|
base_args = {"team1Name": team1, "team2Name": team2, "date": date_obj, "league": league}
|
||||||
|
|
||||||
# 4. Bet Type Classification (Strict parsing per tickety.md)
|
# 4. Bet Type Classification
|
||||||
for ln in lines:
|
for ln in lines:
|
||||||
lower_line = ln.lower()
|
lower_line = ln.lower()
|
||||||
|
|
||||||
# Výsledek zápasu (1X2) - strict match to avoid "Výsledek 1. poločasu"
|
# Výsledek zápasu (1X2)
|
||||||
m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
|
m_vysl = re.search(r"výsledek zápasu\s*:?\s*(1|0|x|2)$", lower_line)
|
||||||
if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
|
if m_vysl and "dvojtip" not in lower_line and "remízy" not in lower_line:
|
||||||
pick = m_vysl.group(1).upper()
|
pick = m_vysl.group(1).upper()
|
||||||
@@ -75,7 +76,6 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
|
m_dvoj = re.search(r"výsledek zápasu - dvojtip\s*:?\s*(10|01|02|20|12|1x|x1|x2|2x)$", lower_line)
|
||||||
if m_dvoj:
|
if m_dvoj:
|
||||||
pick = m_dvoj.group(1).replace("x", "0").replace("X", "0")
|
pick = m_dvoj.group(1).replace("x", "0").replace("X", "0")
|
||||||
# Normalize combinations to match literal definitions in ticket.py
|
|
||||||
if pick in ["10", "01"]:
|
if pick in ["10", "01"]:
|
||||||
pick = "01"
|
pick = "01"
|
||||||
elif pick in ["20", "02"]:
|
elif pick in ["20", "02"]:
|
||||||
@@ -86,7 +86,7 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
if pick in ["01", "12", "02"]:
|
if pick in ["01", "12", "02"]:
|
||||||
return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)
|
return WinDrawLoseDouble(ticketType=BetType.WIN_DRAW_LOSE_DOUBLE, betType=pick, **base_args)
|
||||||
|
|
||||||
# Výsledek zápasu bez remízy (Sázka bez remízy)
|
# Výsledek zápasu bez remízy
|
||||||
m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
|
m_bez = re.search(r"bez remízy\s*:?\s*(1|2)$", lower_line)
|
||||||
if m_bez:
|
if m_bez:
|
||||||
return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)
|
return WinLose(ticketType=BetType.WIN_LOSE, betType=m_bez.group(1), **base_args)
|
||||||
@@ -97,12 +97,11 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
if m_btts.group(1) == "ano":
|
if m_btts.group(1) == "ano":
|
||||||
return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
|
return BothTeamScored(ticketType=BetType.BOTH_TEAM_SCORED, **base_args)
|
||||||
else:
|
else:
|
||||||
# `ticket.py` does not currently define resolution for "Ne" logic
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# Počet gólů v zápasu (e.g., Počet gólů v zápasu 2.5: + 2.5)
|
# Počet gólů v zápasu
|
||||||
m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
|
m_goals = re.search(r"počet gólů v zápasu.*?:\s*([+-])\s*([\d.]+)", lower_line)
|
||||||
if m_goals and "tým" not in lower_line: # Avoids team-specific goals unless implemented
|
if m_goals and "tým" not in lower_line:
|
||||||
sign = m_goals.group(1)
|
sign = m_goals.group(1)
|
||||||
val = float(m_goals.group(2))
|
val = float(m_goals.group(2))
|
||||||
is_over = sign == "+"
|
is_over = sign == "+"
|
||||||
@@ -119,7 +118,7 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
val = float(m_hcp.group(2))
|
val = float(m_hcp.group(2))
|
||||||
return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args)
|
return GoalHandicap(ticketType=BetType.GOAL_HANDICAP, team_bet=team_bet, handicap_amount=val, **base_args)
|
||||||
|
|
||||||
# Fallback for doubts, unknown formats (e.g., "Výsledek 1. poločasu"), or parsing failures
|
# Fallback
|
||||||
return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
|
return UnknownBet(ticketType=BetType.UNKNOWN, raw_text=raw_text, **base_args)
|
||||||
|
|
||||||
|
|
||||||
@@ -143,29 +142,52 @@ def classify(text: str) -> list[Bet]:
|
|||||||
|
|
||||||
blocks = []
|
blocks = []
|
||||||
current_block = []
|
current_block = []
|
||||||
|
in_block = False
|
||||||
|
|
||||||
# Sports that Fortuna commonly uses as block terminators
|
# START trigger: Looks for 'dnes', 'zítra', or 'DD.MM.'
|
||||||
# We look for the line that contains e.g., "Fotbal / 1. Španělsko"
|
date_start_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.|\b(dnes|zítra|zitra|včera|vcera)\b)", re.IGNORECASE)
|
||||||
sport_delimiter_pattern = re.compile(
|
|
||||||
r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
|
# END trigger: Looks for standard Fortuna sport prefixes
|
||||||
)
|
sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE)
|
||||||
|
|
||||||
for ln in lines:
|
for ln in lines:
|
||||||
current_block.append(ln)
|
is_start = date_start_pattern.search(ln)
|
||||||
|
is_end = sport_end_pattern.match(ln)
|
||||||
|
|
||||||
# If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
|
if is_start:
|
||||||
if sport_delimiter_pattern.match(ln):
|
# If we somehow hit a start while already in a block (missing end marker fallback),
|
||||||
|
# save the current block before starting a new one.
|
||||||
|
if current_block:
|
||||||
|
print(f"ERROR: SUS block was not properly ended and new block start detected: '{ln}'")
|
||||||
|
blocks.append(current_block)
|
||||||
|
current_block = [ln]
|
||||||
|
in_block = True
|
||||||
|
|
||||||
|
elif is_end:
|
||||||
|
# We hit the league/sport line. Add it, save the block, and close the window.
|
||||||
|
current_block.append(ln)
|
||||||
blocks.append(current_block)
|
blocks.append(current_block)
|
||||||
current_block = []
|
current_block = []
|
||||||
|
in_block = False
|
||||||
|
|
||||||
# Catch any dangling lines at the end that didn't have a clean sport delimiter
|
elif in_block:
|
||||||
if current_block and len(current_block) > 1:
|
# We are inside a block, gathering standard match info (teams, bet types).
|
||||||
|
current_block.append(ln)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170").
|
||||||
|
# We simply ignore it and do nothing.
|
||||||
|
print(f"INFO: Ignoring line outside of any block: '{ln}'")
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Catch any dangling block at the very end of the document
|
||||||
|
if current_block:
|
||||||
blocks.append(current_block)
|
blocks.append(current_block)
|
||||||
|
|
||||||
# Parse each block into a separate Bet object
|
# Parse each block into a separate Bet object
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
|
if len(block) > 1: # Ensure the block has enough lines to be valid
|
||||||
bets.append(_parse_block(block))
|
bets.append(_parse_block(block))
|
||||||
|
|
||||||
return bets
|
return bets
|
||||||
|
|
||||||
@@ -183,9 +205,10 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
|
|||||||
|
|
||||||
extracted_text = img_to_text(str(file_path))
|
extracted_text = img_to_text(str(file_path))
|
||||||
print(f"--- Extracted Text from {file_path.name} ---")
|
print(f"--- Extracted Text from {file_path.name} ---")
|
||||||
print(extracted_text)
|
# print(extracted_text)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
print("--- START OF Classification Result ---")
|
||||||
result = classify(extracted_text)
|
result = classify(extracted_text)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"classify() error for {file_path}: {exc}")
|
print(f"classify() error for {file_path}: {exc}")
|
||||||
@@ -200,8 +223,8 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
for bet in result:
|
# for bet in result:
|
||||||
print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
|
# print(f"-> Parsed: {bet.ticketType.value} | {bet.team1Name} vs {bet.team2Name} | {bet.league}")
|
||||||
|
|
||||||
ticket.bets.extend(result)
|
ticket.bets.extend(result)
|
||||||
|
|
||||||
@@ -210,4 +233,4 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Test script runner
|
# Test script runner
|
||||||
img_classify(["./data/screenshots/2.png"], ticket_id=2)
|
img_classify(["./data/screenshots/26.png", "./data/screenshots/27.png"], ticket_id=2)
|
||||||
|
|||||||
Reference in New Issue
Block a user