diff --git a/README.md b/README.md index a63738d..1544817 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,15 @@ Command behavior: - moves valid files to archive - moves invalid files to failed +### Source Identity Namespacing + +Raw external IDs are **not globally unique** across basketball data sources. HoopScout v2 uses a namespaced identity for imported entities: +- `Competition`: unique key is `(source_name, source_uid)` +- `Team`: unique key is `(source_name, source_uid)` +- `Player`: unique key is `(source_name, source_uid)` + +`source_uid` values from different sources (for example `lba` and `bcl`) can safely overlap without overwriting each other. + Import history is visible in Django admin: - `ImportRun` - `ImportFile` diff --git a/apps/competitions/admin.py b/apps/competitions/admin.py index 9796d9d..a592d42 100644 --- a/apps/competitions/admin.py +++ b/apps/competitions/admin.py @@ -5,9 +5,9 @@ from .models import Competition, Season, TeamSeason @admin.register(Competition) class CompetitionAdmin(admin.ModelAdmin): - list_display = ("name", "source_uid", "competition_type", "gender", "country", "is_active") + list_display = ("name", "source_name", "source_uid", "competition_type", "gender", "country", "is_active") list_filter = ("competition_type", "gender", "country", "is_active") - search_fields = ("name", "slug", "source_uid") + search_fields = ("name", "slug", "source_name", "source_uid") @admin.register(Season) diff --git a/apps/competitions/migrations/0004_competition_source_namespaced_uid.py b/apps/competitions/migrations/0004_competition_source_namespaced_uid.py new file mode 100644 index 0000000..ed0ffaa --- /dev/null +++ b/apps/competitions/migrations/0004_competition_source_namespaced_uid.py @@ -0,0 +1,35 @@ +# Generated by Django 5.2.12 on 2026-03-13 15:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("competitions", "0003_competition_source_uid_season_source_uid_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="competition", + name="source_name", + field=models.CharField(blank=True, default="", max_length=120), + ), + migrations.AlterField( + model_name="competition", + name="source_uid", + field=models.CharField(blank=True, max_length=120, null=True), + ), + migrations.AddConstraint( + model_name="competition", + constraint=models.UniqueConstraint( + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + fields=("source_name", "source_uid"), + name="uq_competition_source_namespace_uid", + ), + ), + migrations.AddIndex( + model_name="competition", + index=models.Index(fields=["source_name", "source_uid"], name="competition_source__4c5f3d_idx"), + ), + ] diff --git a/apps/competitions/models.py b/apps/competitions/models.py index aa928f9..d04a404 100644 --- a/apps/competitions/models.py +++ b/apps/competitions/models.py @@ -14,7 +14,8 @@ class Competition(models.Model): name = models.CharField(max_length=220) slug = models.SlugField(max_length=240, unique=True) - source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) + source_name = models.CharField(max_length=120, blank=True, default="") + source_uid = models.CharField(max_length=120, blank=True, null=True) competition_type = models.CharField(max_length=24, choices=CompetitionType.choices) gender = models.CharField(max_length=16, choices=Gender.choices, default=Gender.MEN) level = models.PositiveSmallIntegerField(default=1) @@ -32,10 +33,16 @@ class Competition(models.Model): class Meta: ordering = ["name"] constraints = [ - models.UniqueConstraint(fields=["name", "country"], name="uq_competition_name_country") + models.UniqueConstraint(fields=["name", "country"], name="uq_competition_name_country"), + models.UniqueConstraint( + fields=["source_name", "source_uid"], + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + name="uq_competition_source_namespace_uid", + ), ] indexes = [ models.Index(fields=["name"]), + models.Index(fields=["source_name", "source_uid"]), models.Index(fields=["source_uid"]), models.Index(fields=["country"]), models.Index(fields=["competition_type"]), diff --git a/apps/ingestion/services/snapshot_import.py b/apps/ingestion/services/snapshot_import.py index 4640db8..d596013 100644 --- a/apps/ingestion/services/snapshot_import.py +++ b/apps/ingestion/services/snapshot_import.py @@ -96,9 +96,26 @@ def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot ) +def _source_slug(*, source_name: str, base_name: str, fallback_prefix: str, fallback_external_id: str) -> str: + base_slug = slugify(base_name) or f"{fallback_prefix}-{fallback_external_id}" + source_slug = slugify(source_name) or "snapshot" + return f"{source_slug}-{base_slug}" + + +def _normalized_source_name(source_name: str) -> str: + return source_name.strip().lower() + + def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None: - competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}" + source_key = _normalized_source_name(source_name) + competition_slug = _source_slug( + source_name=source_key, + base_name=record["competition_name"], + fallback_prefix="competition", + fallback_external_id=record["competition_external_id"], + ) competition, _ = Competition.objects.update_or_create( + source_name=source_key, source_uid=record["competition_external_id"], defaults={ "name": record["competition_name"], @@ -119,8 +136,14 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d }, ) - team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}" + team_slug = _source_slug( + source_name=source_key, + base_name=record["team_name"], + fallback_prefix="team", + fallback_external_id=record["team_external_id"], + ) team, _ = Team.objects.update_or_create( + source_name=source_key, source_uid=record["team_external_id"], defaults={ "name": record["team_name"], @@ -141,6 +164,7 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d ) player, _ = Player.objects.update_or_create( + source_name=source_key, source_uid=record["player_external_id"], defaults={ "first_name": record["first_name"], @@ -157,7 +181,7 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d ) player_season, _ = PlayerSeason.objects.update_or_create( - source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date), + source_uid=_player_season_source_uid(record, source_name=source_key, snapshot_date=snapshot_date), defaults={ "player": player, "season": season, diff --git a/apps/players/admin.py b/apps/players/admin.py index 3bc762f..f7f6cd7 100644 --- a/apps/players/admin.py +++ b/apps/players/admin.py @@ -37,6 +37,7 @@ class PlayerCareerEntryInline(admin.TabularInline): class PlayerAdmin(admin.ModelAdmin): list_display = ( "full_name", + "source_name", "source_uid", "birth_date", "nationality", @@ -54,7 +55,7 @@ class PlayerAdmin(admin.ModelAdmin): "origin_competition", "origin_team", ) - search_fields = ("full_name", "first_name", "last_name", "source_uid") + search_fields = ("full_name", "first_name", "last_name", "source_name", "source_uid") inlines = (PlayerAliasInline, PlayerCareerEntryInline) actions = ("recompute_origin_fields",) diff --git a/apps/players/migrations/0007_player_source_namespaced_uid.py b/apps/players/migrations/0007_player_source_namespaced_uid.py new file mode 100644 index 0000000..f16a86b --- /dev/null +++ b/apps/players/migrations/0007_player_source_namespaced_uid.py @@ -0,0 +1,39 @@ +# Generated by Django 5.2.12 on 2026-03-13 15:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("players", "0006_player_source_uid_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="player", + name="source_name", + field=models.CharField(blank=True, default="", max_length=120), + ), + migrations.AlterField( + model_name="player", + name="source_uid", + field=models.CharField(blank=True, max_length=120, null=True), + ), + migrations.RemoveConstraint( + model_name="player", + name="uq_player_full_name_birth_date", + ), + migrations.AddConstraint( + model_name="player", + constraint=models.UniqueConstraint( + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + fields=("source_name", "source_uid"), + name="uq_player_source_namespace_uid", + ), + ), + migrations.AddIndex( + model_name="player", + index=models.Index(fields=["source_name", "source_uid"], name="players_pla_source__73848c_idx"), + ), + ] diff --git a/apps/players/models.py b/apps/players/models.py index adae7fb..b78cc28 100644 --- a/apps/players/models.py +++ b/apps/players/models.py @@ -58,7 +58,8 @@ class Player(TimeStampedModel): first_name = models.CharField(max_length=120) last_name = models.CharField(max_length=120) full_name = models.CharField(max_length=260) - source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) + source_name = models.CharField(max_length=120, blank=True, default="") + source_uid = models.CharField(max_length=120, blank=True, null=True) birth_date = models.DateField(blank=True, null=True) nationality = models.ForeignKey( "players.Nationality", @@ -109,12 +110,14 @@ class Player(TimeStampedModel): ordering = ["full_name", "id"] constraints = [ models.UniqueConstraint( - fields=["full_name", "birth_date"], - name="uq_player_full_name_birth_date", + fields=["source_name", "source_uid"], + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + name="uq_player_source_namespace_uid", ) ] indexes = [ models.Index(fields=["full_name"]), + models.Index(fields=["source_name", "source_uid"]), models.Index(fields=["source_uid"]), models.Index(fields=["last_name", "first_name"]), models.Index(fields=["birth_date"]), diff --git a/apps/teams/admin.py b/apps/teams/admin.py index 9f3d80c..6102e47 100644 --- a/apps/teams/admin.py +++ b/apps/teams/admin.py @@ -5,6 +5,6 @@ from .models import Team @admin.register(Team) class TeamAdmin(admin.ModelAdmin): - list_display = ("name", "source_uid", "short_name", "country", "is_national_team") + list_display = ("name", "source_name", "source_uid", "short_name", "country", "is_national_team") list_filter = ("is_national_team", "country") - search_fields = ("name", "short_name", "slug", "source_uid") + search_fields = ("name", "short_name", "slug", "source_name", "source_uid") diff --git a/apps/teams/migrations/0003_team_source_namespaced_uid.py b/apps/teams/migrations/0003_team_source_namespaced_uid.py new file mode 100644 index 0000000..9f3d92e --- /dev/null +++ b/apps/teams/migrations/0003_team_source_namespaced_uid.py @@ -0,0 +1,35 @@ +# Generated by Django 5.2.12 on 2026-03-13 15:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("teams", "0002_team_source_uid_team_teams_team_source__940258_idx"), + ] + + operations = [ + migrations.AddField( + model_name="team", + name="source_name", + field=models.CharField(blank=True, default="", max_length=120), + ), + migrations.AlterField( + model_name="team", + name="source_uid", + field=models.CharField(blank=True, max_length=120, null=True), + ), + migrations.AddConstraint( + model_name="team", + constraint=models.UniqueConstraint( + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + fields=("source_name", "source_uid"), + name="uq_team_source_namespace_uid", + ), + ), + migrations.AddIndex( + model_name="team", + index=models.Index(fields=["source_name", "source_uid"], name="teams_team_source__8035ae_idx"), + ), + ] diff --git a/apps/teams/models.py b/apps/teams/models.py index c359dce..97a5947 100644 --- a/apps/teams/models.py +++ b/apps/teams/models.py @@ -5,7 +5,8 @@ class Team(models.Model): name = models.CharField(max_length=200) short_name = models.CharField(max_length=80, blank=True) slug = models.SlugField(max_length=220, unique=True) - source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) + source_name = models.CharField(max_length=120, blank=True, default="") + source_uid = models.CharField(max_length=120, blank=True, null=True) country = models.ForeignKey( "players.Nationality", on_delete=models.SET_NULL, @@ -21,11 +22,17 @@ class Team(models.Model): class Meta: ordering = ["name"] constraints = [ - models.UniqueConstraint(fields=["name", "country"], name="uq_team_name_country") + models.UniqueConstraint(fields=["name", "country"], name="uq_team_name_country"), + models.UniqueConstraint( + fields=["source_name", "source_uid"], + condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""), + name="uq_team_source_namespace_uid", + ), ] indexes = [ models.Index(fields=["name"]), models.Index(fields=["slug"]), + models.Index(fields=["source_name", "source_uid"]), models.Index(fields=["source_uid"]), models.Index(fields=["country"]), models.Index(fields=["is_national_team"]), diff --git a/tests/test_import_snapshots_command.py b/tests/test_import_snapshots_command.py index 67b279f..ce47446 100644 --- a/tests/test_import_snapshots_command.py +++ b/tests/test_import_snapshots_command.py @@ -51,6 +51,14 @@ def _valid_payload() -> dict: } +def _valid_payload_for_source(source_name: str, *, competition_name: str = "NBA", team_name: str = "Los Angeles Lakers") -> dict: + payload = _valid_payload() + payload["source_name"] = source_name + payload["records"][0]["competition_name"] = competition_name + payload["records"][0]["team_name"] = team_name + return payload + + def _write_json(path: Path, payload: dict) -> None: path.write_text(json.dumps(payload), encoding="utf-8") @@ -87,9 +95,9 @@ def test_valid_snapshot_import(tmp_path, settings): assert (archive / "nba-2026-03-13.json").exists() assert not (incoming / "nba-2026-03-13.json").exists() - assert Competition.objects.filter(source_uid="comp-nba").exists() - assert Team.objects.filter(source_uid="team-lal").exists() - assert Player.objects.filter(source_uid="player-23").exists() + assert Competition.objects.filter(source_name="official_site_feed", source_uid="comp-nba").exists() + assert Team.objects.filter(source_name="official_site_feed", source_uid="team-lal").exists() + assert Player.objects.filter(source_name="official_site_feed", source_uid="player-23").exists() assert Season.objects.filter(source_uid="season:2025-2026").exists() assert PlayerSeason.objects.count() == 1 assert PlayerSeasonStats.objects.count() == 1 @@ -187,3 +195,59 @@ def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings): assert files["a.json"].status == ImportFile.FileStatus.SUCCESS assert files["b.json"].status == ImportFile.FileStatus.SKIPPED assert files["a.json"].checksum == files["b.json"].checksum + + +@pytest.mark.django_db +def test_same_raw_external_ids_from_different_sources_do_not_collide(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + lba_payload = _valid_payload_for_source("lba", competition_name="Lega Basket Serie A", team_name="Virtus Bologna") + bcl_payload = _valid_payload_for_source("bcl", competition_name="Basketball Champions League", team_name="AEK Athens") + + _write_json(incoming / "lba.json", lba_payload) + _write_json(incoming / "bcl.json", bcl_payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + assert Competition.objects.filter(source_uid="comp-nba").count() == 2 + assert Team.objects.filter(source_uid="team-lal").count() == 2 + assert Player.objects.filter(source_uid="player-23").count() == 2 + assert Competition.objects.filter(source_name="lba", source_uid="comp-nba", name="Lega Basket Serie A").exists() + assert Competition.objects.filter(source_name="bcl", source_uid="comp-nba", name="Basketball Champions League").exists() + assert Team.objects.filter(source_name="lba", source_uid="team-lal", name="Virtus Bologna").exists() + assert Team.objects.filter(source_name="bcl", source_uid="team-lal", name="AEK Athens").exists() + + +@pytest.mark.django_db +def test_reimport_same_source_payload_remains_idempotent(tmp_path, settings): + incoming = tmp_path / "incoming" + archive = tmp_path / "archive" + failed = tmp_path / "failed" + incoming.mkdir() + archive.mkdir() + failed.mkdir() + + payload = _valid_payload_for_source("lba") + _write_json(incoming / "lba-1.json", payload) + + settings.STATIC_DATASET_INCOMING_DIR = str(incoming) + settings.STATIC_DATASET_ARCHIVE_DIR = str(archive) + settings.STATIC_DATASET_FAILED_DIR = str(failed) + + call_command("import_snapshots") + + _write_json(incoming / "lba-2.json", payload) + call_command("import_snapshots") + + assert Competition.objects.filter(source_name="lba", source_uid="comp-nba").count() == 1 + assert Team.objects.filter(source_name="lba", source_uid="team-lal").count() == 1 + assert Player.objects.filter(source_name="lba", source_uid="player-23").count() == 1 diff --git a/tests/test_models_domain.py b/tests/test_models_domain.py index 15256a6..1366cfd 100644 --- a/tests/test_models_domain.py +++ b/tests/test_models_domain.py @@ -8,59 +8,31 @@ from apps.competitions.models import Competition, Season from apps.ingestion.models import ImportFile, ImportRun from apps.players.models import Nationality, Player, Position, Role from apps.scouting.models import FavoritePlayer, SavedSearch -from apps.stats.models import PlayerSeason from apps.teams.models import Team @pytest.mark.django_db -def test_player_unique_full_name_birth_date_constraint(): - nationality = Nationality.objects.create(name="Italy", iso2_code="IT", iso3_code="ITA") - position = Position.objects.create(code="PG", name="Point Guard") - role = Role.objects.create(code="playmaker", name="Playmaker") - - Player.objects.create( - first_name="Marco", - last_name="Rossi", - full_name="Marco Rossi", - birth_date=date(2001, 1, 1), - nationality=nationality, - nominal_position=position, - inferred_role=role, - source_uid="player-src-1", - ) - - with pytest.raises(IntegrityError): - Player.objects.create( - first_name="Marco", - last_name="Rossi", - full_name="Marco Rossi", - birth_date=date(2001, 1, 1), - nationality=nationality, - nominal_position=position, - inferred_role=role, - ) - - -@pytest.mark.django_db -def test_source_uid_uniqueness_on_core_entities(): - season = Season.objects.create( +def test_source_uid_uniqueness_is_scoped_by_source_name(): + Season.objects.create( source_uid="season-2024", label="2024-2025", start_date=date(2024, 10, 1), end_date=date(2025, 6, 30), ) - competition = Competition.objects.create( + Competition.objects.create( + source_name="lba", source_uid="comp-001", name="Serie A", slug="serie-a", competition_type=Competition.CompetitionType.LEAGUE, ) - team = Team.objects.create(source_uid="team-001", name="Virtus Bologna", slug="virtus-bologna") + Team.objects.create(source_name="lba", source_uid="team-001", name="Virtus Bologna", slug="virtus-bologna") nationality = Nationality.objects.create(name="Spain", iso2_code="ES", iso3_code="ESP") position = Position.objects.create(code="SF", name="Small Forward") role = Role.objects.create(code="wing", name="Wing") - player = Player.objects.create( + Player.objects.create( + source_name="lba", source_uid="player-001", first_name="Juan", last_name="Perez", @@ -71,16 +43,32 @@ def test_source_uid_uniqueness_on_core_entities(): inferred_role=role, ) - PlayerSeason.objects.create( - source_uid="ps-001", - player=player, - season=season, - team=team, - competition=competition, + Competition.objects.create( + source_name="bcl", + source_uid="comp-001", + name="BCL", + slug="bcl", + competition_type=Competition.CompetitionType.INTERNATIONAL, + ) + Team.objects.create(source_name="bcl", source_uid="team-001", name="AEK", slug="aek") + Player.objects.create( + source_name="bcl", + source_uid="player-001", + first_name="Juan", + last_name="Perez", + full_name="Juan Perez", + birth_date=date(2000, 5, 1), + nationality=nationality, + nominal_position=position, + inferred_role=role, ) + assert Competition.objects.filter(source_uid="comp-001").count() == 2 + assert Team.objects.filter(source_uid="team-001").count() == 2 + assert Player.objects.filter(source_uid="player-001").count() == 2 + with pytest.raises(IntegrityError): - Team.objects.create(source_uid="team-001", name="Another Team", slug="another-team") + Team.objects.create(source_name="lba", source_uid="team-001", name="Another Team", slug="another-team") @pytest.mark.django_db