This commit is contained in:
2026-03-25 19:04:17 +01:00
parent ebdf4f1572
commit d5211572a5
8 changed files with 67 additions and 39 deletions

View File

@@ -62,3 +62,5 @@ resolver:
img_classifier: img_classifier:
target_path: data/screenshots/ target_path: data/screenshots/
log_level: INFO # set to DEBUG to see raw classifier and resolver output

View File

@@ -1,5 +1,14 @@
from __future__ import annotations from __future__ import annotations
import logging
_logger = logging.getLogger("beaky")
def log(text: str) -> None:
    """Forward ``text`` to the module's ``_logger`` as a DEBUG-level record.

    The text may contain ANSI color escapes. It is handed over as the
    argument of a ``"%s"`` template rather than as the template itself, so
    any ``%`` characters inside it are not treated as format directives.
    """
    _logger.log(logging.DEBUG, "%s", text)
def bold(text: str) -> str: def bold(text: str) -> str:
return f"\033[1m{text}\033[0m" return f"\033[1m{text}\033[0m"

View File

@@ -11,3 +11,4 @@ class Config:
screenshotter: ScreenshotterConfig screenshotter: ScreenshotterConfig
resolver: ResolverConfig resolver: ResolverConfig
img_classifier: ImgClassifierConfig img_classifier: ImgClassifierConfig
log_level: str = "INFO"

View File

@@ -1,9 +1,12 @@
import datetime import datetime
import logging
import re import re
from pathlib import Path from pathlib import Path
import pytesseract import pytesseract
logger = logging.getLogger(__name__)
from beaky.datamodels.ticket import ( from beaky.datamodels.ticket import (
Advance, Advance,
Bet, Bet,
@@ -24,7 +27,7 @@ def img_to_text(path: str) -> str:
try: try:
return pytesseract.image_to_string(path, lang="ces").strip() return pytesseract.image_to_string(path, lang="ces").strip()
except Exception as e: except Exception as e:
print(f"Error processing {path}: {e}") logger.error("Error processing %s: %s", path, e)
return "" return ""
@@ -151,7 +154,7 @@ def classify(text: str) -> list[Bet]:
sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE) sport_end_pattern = re.compile(r"^(Fotbal|Hokej|Tenis|Basketbal|Florbal|Volejbal|E-sport|Šipky)\s*/", re.IGNORECASE)
for ln in lines: for ln in lines:
print(f"Processing line: '{ln}'") logger.debug("Processing line: '%s'", ln)
is_start = date_start_pattern.search(ln) is_start = date_start_pattern.search(ln)
is_end = sport_end_pattern.match(ln) is_end = sport_end_pattern.match(ln)
@@ -159,7 +162,7 @@ def classify(text: str) -> list[Bet]:
# If we somehow hit a start while already in a block (missing end marker fallback), # If we somehow hit a start while already in a block (missing end marker fallback),
# save the current block before starting a new one. # save the current block before starting a new one.
if current_block: if current_block:
print(f"ERROR: SUS block was not properly ended and new block start detected: '{ln}'") logger.warning("Block not properly ended, new block start detected: '%s'", ln)
blocks.append(current_block) blocks.append(current_block)
current_block = [ln] current_block = [ln]
in_block = True in_block = True
@@ -178,7 +181,7 @@ def classify(text: str) -> list[Bet]:
else: else:
# We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170"). # We are outside a block. This is noise (e.g. "© osmifinále / 2.zápas 0:1" or "170").
# We simply ignore it and do nothing. # We simply ignore it and do nothing.
print(f"INFO: Ignoring line outside of any block: '{ln}'") logger.debug("Ignoring line outside of any block: '%s'", ln)
pass pass
# Catch any dangling block at the very end of the document # Catch any dangling block at the very end of the document
@@ -201,18 +204,16 @@ def img_classify(paths: list[str], ticket_id: int) -> Ticket:
for file in paths: for file in paths:
file_path = Path(file) file_path = Path(file)
if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions): if not (file_path.is_file() and file_path.suffix.lower() in valid_extensions):
print(f"Skipping invalid file: {file}") logger.warning("Skipping invalid file: %s", file)
continue continue
extracted_text = img_to_text(str(file_path)) extracted_text = img_to_text(str(file_path))
print(f"--- Extracted Text from {file_path.name} ---") logger.debug("Extracted text from %s", file_path.name)
# print(extracted_text)
try: try:
print("--- START OF Classification Result ---")
result = classify(extracted_text) result = classify(extracted_text)
except Exception as exc: except Exception as exc:
print(f"classify() error for {file_path}: {exc}") logger.error("classify() error for %s: %s", file_path, exc)
result = [ result = [
UnknownBet( UnknownBet(
ticketType=BetType.UNKNOWN, ticketType=BetType.UNKNOWN,

View File

@@ -1,9 +1,12 @@
import logging
import re import re
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any
from playwright.sync_api import Page, sync_playwright from playwright.sync_api import Page, sync_playwright
logger = logging.getLogger(__name__)
from beaky.datamodels.ticket import ( from beaky.datamodels.ticket import (
Bet, Bet,
BetType, BetType,
@@ -151,7 +154,7 @@ class LinkClassifier:
page.wait_for_timeout(500) page.wait_for_timeout(500)
result = Ticket(id=link.id, bets=_extract_legs(page, link.date)) result = Ticket(id=link.id, bets=_extract_legs(page, link.date))
except Exception as e: except Exception as e:
print(f"Error classifying link {link.id}: {e}") logger.error("Error classifying link %d: %s", link.id, e)
finally: finally:
page.close() page.close()
browser.close() browser.close()

View File

@@ -1,10 +1,14 @@
import logging
import time import time
from dataclasses import dataclass, field from dataclasses import field
from datetime import date, datetime, timedelta from datetime import date, datetime, timedelta
from difflib import SequenceMatcher from difflib import SequenceMatcher
from enum import Enum from enum import Enum
from typing import Any from typing import Any
from pydantic import ConfigDict, SerializeAsAny
from pydantic.dataclasses import dataclass
import diskcache import diskcache
import requests import requests
@@ -18,6 +22,8 @@ from beaky.datamodels.ticket import (
) )
from beaky.resolvers.config import ResolverConfig from beaky.resolvers.config import ResolverConfig
logger = logging.getLogger(__name__)
_API_BASE = "https://v3.football.api-sports.io" _API_BASE = "https://v3.football.api-sports.io"
_DATE_WINDOW = 3 # days either side of the bet date to search _DATE_WINDOW = 3 # days either side of the bet date to search
@@ -30,9 +36,9 @@ class TicketVerdict(str, Enum):
UNKNOWN = "unknown — could not resolve enough bets to decide" UNKNOWN = "unknown — could not resolve enough bets to decide"
@dataclass @dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class ResolvedBet: class ResolvedBet:
bet: Bet bet: SerializeAsAny[Bet]
outcome: BetOutcome outcome: BetOutcome
fixture_id: int | None = None fixture_id: int | None = None
# Confidence breakdown (each component 0.0–1.0): # Confidence breakdown (each component 0.0–1.0):
@@ -73,11 +79,11 @@ def _get(url: str, headers: dict[str, str], params: dict[str, str | int], retrie
resp = requests.get(url, headers=headers, params=params) resp = requests.get(url, headers=headers, params=params)
if resp.status_code == 429: if resp.status_code == 429:
wait = backoff * (attempt + 1) wait = backoff * (attempt + 1)
print(f" !! rate limited — waiting {wait:.0f}s before retry ({attempt + 1}/{retries})") logger.warning("rate limited — waiting %.0fs before retry (%d/%d)", wait, attempt + 1, retries)
time.sleep(wait) time.sleep(wait)
continue continue
return resp return resp
print(f" !! still rate limited after {retries} retries, giving up") logger.warning("still rate limited after %d retries, giving up", retries)
return resp return resp
@@ -99,17 +105,17 @@ class TicketResolver:
def _resolve_bet(self, bet: Bet) -> ResolvedBet: def _resolve_bet(self, bet: Bet) -> ResolvedBet:
bet_type = type(bet).__name__ bet_type = type(bet).__name__
print(f"\n {_ansi.bold(_ansi.cyan(f'┌─ [{bet_type}]'))} {_ansi.bold(f'{bet.team1Name} vs {bet.team2Name}')}" _ansi.log(f"\n {_ansi.bold(_ansi.cyan(f'┌─ [{bet_type}]'))} {_ansi.bold(f'{bet.team1Name} vs {bet.team2Name}')}"
f" {_ansi.dim(f'{bet.date.strftime('%Y-%m-%d')} | {bet.league}')}") f" {_ansi.dim(f'{bet.date.strftime("%Y-%m-%d")} | {bet.league}')}")
if isinstance(bet, UnknownBet): if isinstance(bet, UnknownBet):
print(_ansi.gray(f" │ skipping — not implemented: {bet.raw_text!r}")) _ansi.log(_ansi.gray(f" │ skipping — not implemented: {bet.raw_text!r}"))
print(_ansi.gray(" └─ UNKNOWN")) _ansi.log(_ansi.gray(" └─ UNKNOWN"))
return ResolvedBet(bet=bet, outcome=BetOutcome.UNKNOWN) return ResolvedBet(bet=bet, outcome=BetOutcome.UNKNOWN)
fixture, name_match, date_prox, league_conf = self._find_fixture(bet) fixture, name_match, date_prox, league_conf = self._find_fixture(bet)
if fixture is None: if fixture is None:
print(_ansi.gray(" └─ UNKNOWN — no fixture found")) _ansi.log(_ansi.gray(" └─ UNKNOWN — no fixture found"))
return ResolvedBet(bet=bet, outcome=BetOutcome.UNKNOWN, league_found=league_conf) return ResolvedBet(bet=bet, outcome=BetOutcome.UNKNOWN, league_found=league_conf)
home_name = fixture["teams"]["home"]["name"] home_name = fixture["teams"]["home"]["name"]
@@ -126,12 +132,12 @@ class TicketResolver:
outcome = BetOutcome.UNKNOWN outcome = BetOutcome.UNKNOWN
goals = fixture["goals"] goals = fixture["goals"]
print(_ansi.dim( _ansi.log(_ansi.dim(
f" │ matched #{fixture['fixture']['id']}: {home_name} vs {away_name}" f" │ matched #{fixture['fixture']['id']}: {home_name} vs {away_name}"
f" | {goals['home']}:{goals['away']} | {fixture['fixture']['status']['short']}" f" | {goals['home']}:{goals['away']} | {fixture['fixture']['status']['short']}"
f" | confidence {confidence} (name={name_match:.2f} date={date_prox:.2f} league={league_conf} finished={finished})" f" | confidence {confidence} (name={name_match:.2f} date={date_prox:.2f} league={league_conf} finished={finished})"
)) ))
print(_ansi.bold(_ansi.green(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.WIN _ansi.log(_ansi.bold(_ansi.green(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.WIN
else _ansi.red(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.LOSE else _ansi.red(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.LOSE
else _ansi.yellow(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.VOID else _ansi.yellow(f" └─ {outcome.value.upper()}") if outcome == BetOutcome.VOID
else _ansi.gray(f" └─ {outcome.value.upper()}"))) else _ansi.gray(f" └─ {outcome.value.upper()}")))
@@ -151,9 +157,9 @@ class TicketResolver:
def _get_statistics(self, fixture_id: int) -> list[dict[str, Any]]: def _get_statistics(self, fixture_id: int) -> list[dict[str, Any]]:
cache_key = ("stats", fixture_id) cache_key = ("stats", fixture_id)
if cache_key in self._disk_cache: if cache_key in self._disk_cache:
print(_ansi.gray(f" │ /fixtures/statistics served from disk cache (fixture={fixture_id})")) _ansi.log(_ansi.gray(f" │ /fixtures/statistics served from disk cache (fixture={fixture_id})"))
return self._disk_cache[cache_key] # type: ignore[no-any-return] return self._disk_cache[cache_key] # type: ignore[no-any-return]
print(_ansi.gray(f" │ GET /fixtures/statistics fixture={fixture_id}")) _ansi.log(_ansi.gray(f" │ GET /fixtures/statistics fixture={fixture_id}"))
resp = _get(f"{_API_BASE}/fixtures/statistics", headers=self._headers, params={"fixture": fixture_id}) resp = _get(f"{_API_BASE}/fixtures/statistics", headers=self._headers, params={"fixture": fixture_id})
resp.raise_for_status() resp.raise_for_status()
stats = resp.json().get("response", []) stats = resp.json().get("response", [])
@@ -173,7 +179,7 @@ class TicketResolver:
if cache_key not in self._fixture_cache: if cache_key not in self._fixture_cache:
if cache_key in self._disk_cache and not cache_may_be_stale: if cache_key in self._disk_cache and not cache_may_be_stale:
self._fixture_cache[cache_key] = self._disk_cache[cache_key] self._fixture_cache[cache_key] = self._disk_cache[cache_key]
print(_ansi.gray(f" │ /fixtures served from disk cache ({len(self._fixture_cache[cache_key])} fixtures)")) _ansi.log(_ansi.gray(f" │ /fixtures served from disk cache ({len(self._fixture_cache[cache_key])} fixtures)"))
else: else:
date_from = (center - timedelta(days=_DATE_WINDOW)).strftime("%Y-%m-%d") date_from = (center - timedelta(days=_DATE_WINDOW)).strftime("%Y-%m-%d")
date_to = (center + timedelta(days=_DATE_WINDOW)).strftime("%Y-%m-%d") date_to = (center + timedelta(days=_DATE_WINDOW)).strftime("%Y-%m-%d")
@@ -181,17 +187,17 @@ class TicketResolver:
if league_id is not None: if league_id is not None:
params["league"] = league_id params["league"] = league_id
params["season"] = center.year if center.month >= 7 else center.year - 1 params["season"] = center.year if center.month >= 7 else center.year - 1
print(_ansi.gray(f" │ GET /fixtures {params}")) _ansi.log(_ansi.gray(f" │ GET /fixtures {params}"))
resp = _get(f"{_API_BASE}/fixtures", headers=self._headers, params=params) resp = _get(f"{_API_BASE}/fixtures", headers=self._headers, params=params)
resp.raise_for_status() resp.raise_for_status()
self._fixture_cache[cache_key] = resp.json().get("response", []) self._fixture_cache[cache_key] = resp.json().get("response", [])
print(_ansi.gray(f"{len(self._fixture_cache[cache_key])} fixtures returned")) _ansi.log(_ansi.gray(f"{len(self._fixture_cache[cache_key])} fixtures returned"))
cacheable = [f for f in self._fixture_cache[cache_key] if f.get("fixture", {}).get("status", {}).get("short") != "NS"] cacheable = [f for f in self._fixture_cache[cache_key] if f.get("fixture", {}).get("status", {}).get("short") != "NS"]
if cacheable: if cacheable:
self._disk_cache[cache_key] = cacheable self._disk_cache[cache_key] = cacheable
print(_ansi.gray(f"{len(cacheable)} non-NS fixture(s) written to disk cache")) _ansi.log(_ansi.gray(f"{len(cacheable)} non-NS fixture(s) written to disk cache"))
else: else:
print(_ansi.gray(f" │ /fixtures (±{_DATE_WINDOW}d of {date_str}, league={league_id}) served from memory")) _ansi.log(_ansi.gray(f" │ /fixtures (±{_DATE_WINDOW}d of {date_str}, league={league_id}) served from memory"))
fixture, name_match, date_prox = _best_fixture_match( fixture, name_match, date_prox = _best_fixture_match(
self._fixture_cache[cache_key], bet.team1Name, bet.team2Name, center self._fixture_cache[cache_key], bet.team1Name, bet.team2Name, center
@@ -210,22 +216,22 @@ class TicketResolver:
default=(None, None), default=(None, None),
) )
if best_id is not None: if best_id is not None:
print(_ansi.gray(f" │ league {league_name!r} -> id={best_id} (static map, pattern={best_pattern!r})")) _ansi.log(_ansi.gray(f" │ league {league_name!r} -> id={best_id} (static map, pattern={best_pattern!r})"))
self._league_cache[key] = (best_id, 1.0) self._league_cache[key] = (best_id, 1.0)
return best_id, 1.0 return best_id, 1.0
# Fall back to API search — lower confidence since first result is taken unverified # Fall back to API search — lower confidence since first result is taken unverified
print(_ansi.gray(f" │ GET /leagues search={league_name!r}")) _ansi.log(_ansi.gray(f" │ GET /leagues search={league_name!r}"))
resp = _get(f"{_API_BASE}/leagues", headers=self._headers, params={"search": league_name[:20]}) resp = _get(f"{_API_BASE}/leagues", headers=self._headers, params={"search": league_name[:20]})
results = resp.json().get("response", []) results = resp.json().get("response", [])
if results: if results:
league_id = results[0]["league"]["id"] league_id = results[0]["league"]["id"]
league_found_name = results[0]["league"]["name"] league_found_name = results[0]["league"]["name"]
print(_ansi.gray(f" │ matched {league_found_name!r} id={league_id} (API fallback, confidence=0.7)")) _ansi.log(_ansi.gray(f" │ matched {league_found_name!r} id={league_id} (API fallback, confidence=0.7)"))
self._league_cache[key] = (league_id, 0.7) self._league_cache[key] = (league_id, 0.7)
return league_id, 0.7 return league_id, 0.7
print(_ansi.gray(" │ no league found, searching fixtures by date only (confidence=0.3)")) _ansi.log(_ansi.gray(" │ no league found, searching fixtures by date only (confidence=0.3)"))
self._league_cache[key] = (None, 0.3) self._league_cache[key] = (None, 0.3)
return None, 0.3 return None, 0.3

View File

@@ -1,3 +1,4 @@
import logging
from datetime import datetime from datetime import datetime
from typing import Any, Iterator, List, Optional from typing import Any, Iterator, List, Optional
@@ -6,6 +7,8 @@ from pydantic.dataclasses import dataclass
from beaky.config import Config from beaky.config import Config
logger = logging.getLogger(__name__)
@dataclass @dataclass
class Link: class Link:
@@ -37,7 +40,7 @@ class Links:
at least: 'id', 'link' (or 'url'), and optionally 'date' (case-insensitive). at least: 'id', 'link' (or 'url'), and optionally 'date' (case-insensitive).
Returns the list of Link objects (also stored in self.links). Returns the list of Link objects (also stored in self.links).
""" """
print("started ret_links()") logger.debug("started ret_links()")
wb = load_workbook(filename=self._path, read_only=True, data_only=True) wb = load_workbook(filename=self._path, read_only=True, data_only=True)
ws = wb.active ws = wb.active
@@ -84,7 +87,7 @@ class Links:
if id_idx is None or url_idx is None: if id_idx is None or url_idx is None:
# Required columns missing # Required columns missing
print(f"Required 'id' or 'url' column missing in header. Found headers: {list(header_map.keys())}") logger.warning("Required 'id' or 'url' column missing in header. Found headers: %s", list(header_map.keys()))
return [] return []
for row in rows: for row in rows:

View File

@@ -1,3 +1,4 @@
import logging
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -6,6 +7,8 @@ from playwright.sync_api import sync_playwright
from beaky.config import Config from beaky.config import Config
from beaky.scanner.scanner import Link from beaky.scanner.scanner import Link
logger = logging.getLogger(__name__)
class Screenshotter: class Screenshotter:
def __init__(self, config: Config): def __init__(self, config: Config):
@@ -18,7 +21,7 @@ class Screenshotter:
context = browser.new_context() context = browser.new_context()
for link in links: for link in links:
print("capturing link:", link) logger.debug("capturing link: %s", link)
page = context.new_page() page = context.new_page()
target_path = Path(self.config.screenshotter.target_path) / f"{link.id}.png" target_path = Path(self.config.screenshotter.target_path) / f"{link.id}.png"
self.capture_ticket(page, link.url, target_path) self.capture_ticket(page, link.url, target_path)