Image classifier debugging block separation and line separation
This commit is contained in:
@@ -124,11 +124,7 @@ def _parse_block(lines: list[str]) -> Bet:
|
|||||||
|
|
||||||
|
|
||||||
def classify(text: str) -> list[Bet]:
|
def classify(text: str) -> list[Bet]:
|
||||||
"""Return a list of Bet objects parsed from OCR `text`.
|
"""Return a list of Bet objects parsed from OCR `text`."""
|
||||||
|
|
||||||
Separates the text into chunks per bet based on date/time markers, then
|
|
||||||
parses each chunk individually.
|
|
||||||
"""
|
|
||||||
text = (text or "").strip()
|
text = (text or "").strip()
|
||||||
if not text:
|
if not text:
|
||||||
return [
|
return [
|
||||||
@@ -145,28 +141,30 @@ def classify(text: str) -> list[Bet]:
|
|||||||
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
||||||
bets: list[Bet] = []
|
bets: list[Bet] = []
|
||||||
|
|
||||||
# Chunk the OCR output by looking for Date/Time markers
|
|
||||||
# (e.g. "pá 6. 3. 2026 21:00", "dnes 18:30")
|
|
||||||
blocks = []
|
blocks = []
|
||||||
current_block = []
|
current_block = []
|
||||||
date_pattern = re.compile(r"(\d{1,2}\.\s*\d{1,2}\.\s*\d{4}|\bdnes\b|\bzítra\b).*?\d{2}:\d{2}", re.IGNORECASE)
|
|
||||||
|
# Sports that Fortuna commonly uses as block terminators
|
||||||
|
# We look for the line that contains e.g., "Fotbal / 1. Španělsko"
|
||||||
|
sport_delimiter_pattern = re.compile(
|
||||||
|
r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE
|
||||||
|
)
|
||||||
|
|
||||||
for ln in lines:
|
for ln in lines:
|
||||||
if date_pattern.search(ln):
|
|
||||||
if current_block:
|
|
||||||
blocks.append(current_block)
|
|
||||||
current_block = [ln]
|
|
||||||
else:
|
|
||||||
current_block.append(ln)
|
current_block.append(ln)
|
||||||
|
|
||||||
if current_block:
|
# If the line starts with "Fotbal /" (or other sports), it is the LAST line of a bet block
|
||||||
|
if sport_delimiter_pattern.match(ln):
|
||||||
blocks.append(current_block)
|
blocks.append(current_block)
|
||||||
|
current_block = []
|
||||||
|
|
||||||
if not blocks:
|
# Catch any dangling lines at the end that didn't have a clean sport delimiter
|
||||||
blocks = [lines] # Fallback if regex missed the date boundaries
|
if current_block and len(current_block) > 1:
|
||||||
|
blocks.append(current_block)
|
||||||
|
|
||||||
# Parse each block into a separate Bet object
|
# Parse each block into a separate Bet object
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
|
print(f"--- Parsing Block ---\n{block}\n--- End Block ---")
|
||||||
bets.append(_parse_block(block))
|
bets.append(_parse_block(block))
|
||||||
|
|
||||||
return bets
|
return bets
|
||||||
@@ -180,6 +178,7 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
|
|||||||
for file in paths:
|
for file in paths:
|
||||||
file_path = Path(file)
|
file_path = Path(file)
|
||||||
if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
|
if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
|
||||||
|
print(f"Skipping invalid file: {file}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
extracted_text = img_to_text(str(file_path))
|
extracted_text = img_to_text(str(file_path))
|
||||||
|
|||||||
Reference in New Issue
Block a user