fix(v2-import): namespace source identity for snapshot upserts

This commit is contained in:
Alfredo Di Stasio
2026-03-20 14:17:56 +01:00
parent 20d3ee7dae
commit ad85e40688
13 changed files with 272 additions and 60 deletions

View File

@ -185,6 +185,15 @@ Command behavior:
- moves valid files to archive - moves valid files to archive
- moves invalid files to failed - moves invalid files to failed
### Source Identity Namespacing
Raw external IDs are **not globally unique** across basketball data sources. HoopScout v2 uses a namespaced identity for imported entities:
- `Competition`: unique key is `(source_name, source_uid)`
- `Team`: unique key is `(source_name, source_uid)`
- `Player`: unique key is `(source_name, source_uid)`
`source_uid` values from different sources (for example `lba` and `bcl`) can safely overlap without overwriting each other.
Import history is visible in Django admin: Import history is visible in Django admin:
- `ImportRun` - `ImportRun`
- `ImportFile` - `ImportFile`

View File

@ -5,9 +5,9 @@ from .models import Competition, Season, TeamSeason
@admin.register(Competition) @admin.register(Competition)
class CompetitionAdmin(admin.ModelAdmin): class CompetitionAdmin(admin.ModelAdmin):
list_display = ("name", "source_uid", "competition_type", "gender", "country", "is_active") list_display = ("name", "source_name", "source_uid", "competition_type", "gender", "country", "is_active")
list_filter = ("competition_type", "gender", "country", "is_active") list_filter = ("competition_type", "gender", "country", "is_active")
search_fields = ("name", "slug", "source_uid") search_fields = ("name", "slug", "source_name", "source_uid")
@admin.register(Season) @admin.register(Season)

View File

@ -0,0 +1,35 @@
# Generated by Django 5.2.12 on 2026-03-13 15:08
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("competitions", "0003_competition_source_uid_season_source_uid_and_more"),
]
operations = [
migrations.AddField(
model_name="competition",
name="source_name",
field=models.CharField(blank=True, default="", max_length=120),
),
migrations.AlterField(
model_name="competition",
name="source_uid",
field=models.CharField(blank=True, max_length=120, null=True),
),
migrations.AddConstraint(
model_name="competition",
constraint=models.UniqueConstraint(
condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
fields=("source_name", "source_uid"),
name="uq_competition_source_namespace_uid",
),
),
migrations.AddIndex(
model_name="competition",
index=models.Index(fields=["source_name", "source_uid"], name="competition_source__4c5f3d_idx"),
),
]

View File

@ -14,7 +14,8 @@ class Competition(models.Model):
name = models.CharField(max_length=220) name = models.CharField(max_length=220)
slug = models.SlugField(max_length=240, unique=True) slug = models.SlugField(max_length=240, unique=True)
source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) source_name = models.CharField(max_length=120, blank=True, default="")
source_uid = models.CharField(max_length=120, blank=True, null=True)
competition_type = models.CharField(max_length=24, choices=CompetitionType.choices) competition_type = models.CharField(max_length=24, choices=CompetitionType.choices)
gender = models.CharField(max_length=16, choices=Gender.choices, default=Gender.MEN) gender = models.CharField(max_length=16, choices=Gender.choices, default=Gender.MEN)
level = models.PositiveSmallIntegerField(default=1) level = models.PositiveSmallIntegerField(default=1)
@ -32,10 +33,16 @@ class Competition(models.Model):
class Meta: class Meta:
ordering = ["name"] ordering = ["name"]
constraints = [ constraints = [
models.UniqueConstraint(fields=["name", "country"], name="uq_competition_name_country") models.UniqueConstraint(fields=["name", "country"], name="uq_competition_name_country"),
models.UniqueConstraint(
fields=["source_name", "source_uid"],
condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
name="uq_competition_source_namespace_uid",
),
] ]
indexes = [ indexes = [
models.Index(fields=["name"]), models.Index(fields=["name"]),
models.Index(fields=["source_name", "source_uid"]),
models.Index(fields=["source_uid"]), models.Index(fields=["source_uid"]),
models.Index(fields=["country"]), models.Index(fields=["country"]),
models.Index(fields=["competition_type"]), models.Index(fields=["competition_type"]),

View File

@ -96,9 +96,26 @@ def _player_season_source_uid(record: dict[str, Any], source_name: str, snapshot
) )
def _source_slug(*, source_name: str, base_name: str, fallback_prefix: str, fallback_external_id: str) -> str:
base_slug = slugify(base_name) or f"{fallback_prefix}-{fallback_external_id}"
source_slug = slugify(source_name) or "snapshot"
return f"{source_slug}-{base_slug}"
def _normalized_source_name(source_name: str) -> str:
return source_name.strip().lower()
def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None: def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: date) -> None:
competition_slug = slugify(record["competition_name"]) or f"competition-{record['competition_external_id']}" source_key = _normalized_source_name(source_name)
competition_slug = _source_slug(
source_name=source_key,
base_name=record["competition_name"],
fallback_prefix="competition",
fallback_external_id=record["competition_external_id"],
)
competition, _ = Competition.objects.update_or_create( competition, _ = Competition.objects.update_or_create(
source_name=source_key,
source_uid=record["competition_external_id"], source_uid=record["competition_external_id"],
defaults={ defaults={
"name": record["competition_name"], "name": record["competition_name"],
@ -119,8 +136,14 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d
}, },
) )
team_slug = slugify(record["team_name"]) or f"team-{record['team_external_id']}" team_slug = _source_slug(
source_name=source_key,
base_name=record["team_name"],
fallback_prefix="team",
fallback_external_id=record["team_external_id"],
)
team, _ = Team.objects.update_or_create( team, _ = Team.objects.update_or_create(
source_name=source_key,
source_uid=record["team_external_id"], source_uid=record["team_external_id"],
defaults={ defaults={
"name": record["team_name"], "name": record["team_name"],
@ -141,6 +164,7 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d
) )
player, _ = Player.objects.update_or_create( player, _ = Player.objects.update_or_create(
source_name=source_key,
source_uid=record["player_external_id"], source_uid=record["player_external_id"],
defaults={ defaults={
"first_name": record["first_name"], "first_name": record["first_name"],
@ -157,7 +181,7 @@ def _upsert_record(record: dict[str, Any], *, source_name: str, snapshot_date: d
) )
player_season, _ = PlayerSeason.objects.update_or_create( player_season, _ = PlayerSeason.objects.update_or_create(
source_uid=_player_season_source_uid(record, source_name=source_name, snapshot_date=snapshot_date), source_uid=_player_season_source_uid(record, source_name=source_key, snapshot_date=snapshot_date),
defaults={ defaults={
"player": player, "player": player,
"season": season, "season": season,

View File

@ -37,6 +37,7 @@ class PlayerCareerEntryInline(admin.TabularInline):
class PlayerAdmin(admin.ModelAdmin): class PlayerAdmin(admin.ModelAdmin):
list_display = ( list_display = (
"full_name", "full_name",
"source_name",
"source_uid", "source_uid",
"birth_date", "birth_date",
"nationality", "nationality",
@ -54,7 +55,7 @@ class PlayerAdmin(admin.ModelAdmin):
"origin_competition", "origin_competition",
"origin_team", "origin_team",
) )
search_fields = ("full_name", "first_name", "last_name", "source_uid") search_fields = ("full_name", "first_name", "last_name", "source_name", "source_uid")
inlines = (PlayerAliasInline, PlayerCareerEntryInline) inlines = (PlayerAliasInline, PlayerCareerEntryInline)
actions = ("recompute_origin_fields",) actions = ("recompute_origin_fields",)

View File

@ -0,0 +1,39 @@
# Generated by Django 5.2.12 on 2026-03-13 15:08
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("players", "0006_player_source_uid_and_more"),
]
operations = [
migrations.AddField(
model_name="player",
name="source_name",
field=models.CharField(blank=True, default="", max_length=120),
),
migrations.AlterField(
model_name="player",
name="source_uid",
field=models.CharField(blank=True, max_length=120, null=True),
),
migrations.RemoveConstraint(
model_name="player",
name="uq_player_full_name_birth_date",
),
migrations.AddConstraint(
model_name="player",
constraint=models.UniqueConstraint(
condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
fields=("source_name", "source_uid"),
name="uq_player_source_namespace_uid",
),
),
migrations.AddIndex(
model_name="player",
index=models.Index(fields=["source_name", "source_uid"], name="players_pla_source__73848c_idx"),
),
]

View File

@ -58,7 +58,8 @@ class Player(TimeStampedModel):
first_name = models.CharField(max_length=120) first_name = models.CharField(max_length=120)
last_name = models.CharField(max_length=120) last_name = models.CharField(max_length=120)
full_name = models.CharField(max_length=260) full_name = models.CharField(max_length=260)
source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) source_name = models.CharField(max_length=120, blank=True, default="")
source_uid = models.CharField(max_length=120, blank=True, null=True)
birth_date = models.DateField(blank=True, null=True) birth_date = models.DateField(blank=True, null=True)
nationality = models.ForeignKey( nationality = models.ForeignKey(
"players.Nationality", "players.Nationality",
@ -109,12 +110,14 @@ class Player(TimeStampedModel):
ordering = ["full_name", "id"] ordering = ["full_name", "id"]
constraints = [ constraints = [
models.UniqueConstraint( models.UniqueConstraint(
fields=["full_name", "birth_date"], fields=["source_name", "source_uid"],
name="uq_player_full_name_birth_date", condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
name="uq_player_source_namespace_uid",
) )
] ]
indexes = [ indexes = [
models.Index(fields=["full_name"]), models.Index(fields=["full_name"]),
models.Index(fields=["source_name", "source_uid"]),
models.Index(fields=["source_uid"]), models.Index(fields=["source_uid"]),
models.Index(fields=["last_name", "first_name"]), models.Index(fields=["last_name", "first_name"]),
models.Index(fields=["birth_date"]), models.Index(fields=["birth_date"]),

View File

@ -5,6 +5,6 @@ from .models import Team
@admin.register(Team) @admin.register(Team)
class TeamAdmin(admin.ModelAdmin): class TeamAdmin(admin.ModelAdmin):
list_display = ("name", "source_uid", "short_name", "country", "is_national_team") list_display = ("name", "source_name", "source_uid", "short_name", "country", "is_national_team")
list_filter = ("is_national_team", "country") list_filter = ("is_national_team", "country")
search_fields = ("name", "short_name", "slug", "source_uid") search_fields = ("name", "short_name", "slug", "source_name", "source_uid")

View File

@ -0,0 +1,35 @@
# Generated by Django 5.2.12 on 2026-03-13 15:08
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("teams", "0002_team_source_uid_team_teams_team_source__940258_idx"),
]
operations = [
migrations.AddField(
model_name="team",
name="source_name",
field=models.CharField(blank=True, default="", max_length=120),
),
migrations.AlterField(
model_name="team",
name="source_uid",
field=models.CharField(blank=True, max_length=120, null=True),
),
migrations.AddConstraint(
model_name="team",
constraint=models.UniqueConstraint(
condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
fields=("source_name", "source_uid"),
name="uq_team_source_namespace_uid",
),
),
migrations.AddIndex(
model_name="team",
index=models.Index(fields=["source_name", "source_uid"], name="teams_team_source__8035ae_idx"),
),
]

View File

@ -5,7 +5,8 @@ class Team(models.Model):
name = models.CharField(max_length=200) name = models.CharField(max_length=200)
short_name = models.CharField(max_length=80, blank=True) short_name = models.CharField(max_length=80, blank=True)
slug = models.SlugField(max_length=220, unique=True) slug = models.SlugField(max_length=220, unique=True)
source_uid = models.CharField(max_length=120, blank=True, null=True, unique=True) source_name = models.CharField(max_length=120, blank=True, default="")
source_uid = models.CharField(max_length=120, blank=True, null=True)
country = models.ForeignKey( country = models.ForeignKey(
"players.Nationality", "players.Nationality",
on_delete=models.SET_NULL, on_delete=models.SET_NULL,
@ -21,11 +22,17 @@ class Team(models.Model):
class Meta: class Meta:
ordering = ["name"] ordering = ["name"]
constraints = [ constraints = [
models.UniqueConstraint(fields=["name", "country"], name="uq_team_name_country") models.UniqueConstraint(fields=["name", "country"], name="uq_team_name_country"),
models.UniqueConstraint(
fields=["source_name", "source_uid"],
condition=models.Q(source_uid__isnull=False) & ~models.Q(source_uid=""),
name="uq_team_source_namespace_uid",
),
] ]
indexes = [ indexes = [
models.Index(fields=["name"]), models.Index(fields=["name"]),
models.Index(fields=["slug"]), models.Index(fields=["slug"]),
models.Index(fields=["source_name", "source_uid"]),
models.Index(fields=["source_uid"]), models.Index(fields=["source_uid"]),
models.Index(fields=["country"]), models.Index(fields=["country"]),
models.Index(fields=["is_national_team"]), models.Index(fields=["is_national_team"]),

View File

@ -51,6 +51,14 @@ def _valid_payload() -> dict:
} }
def _valid_payload_for_source(source_name: str, *, competition_name: str = "NBA", team_name: str = "Los Angeles Lakers") -> dict:
payload = _valid_payload()
payload["source_name"] = source_name
payload["records"][0]["competition_name"] = competition_name
payload["records"][0]["team_name"] = team_name
return payload
def _write_json(path: Path, payload: dict) -> None: def _write_json(path: Path, payload: dict) -> None:
path.write_text(json.dumps(payload), encoding="utf-8") path.write_text(json.dumps(payload), encoding="utf-8")
@ -87,9 +95,9 @@ def test_valid_snapshot_import(tmp_path, settings):
assert (archive / "nba-2026-03-13.json").exists() assert (archive / "nba-2026-03-13.json").exists()
assert not (incoming / "nba-2026-03-13.json").exists() assert not (incoming / "nba-2026-03-13.json").exists()
assert Competition.objects.filter(source_uid="comp-nba").exists() assert Competition.objects.filter(source_name="official_site_feed", source_uid="comp-nba").exists()
assert Team.objects.filter(source_uid="team-lal").exists() assert Team.objects.filter(source_name="official_site_feed", source_uid="team-lal").exists()
assert Player.objects.filter(source_uid="player-23").exists() assert Player.objects.filter(source_name="official_site_feed", source_uid="player-23").exists()
assert Season.objects.filter(source_uid="season:2025-2026").exists() assert Season.objects.filter(source_uid="season:2025-2026").exists()
assert PlayerSeason.objects.count() == 1 assert PlayerSeason.objects.count() == 1
assert PlayerSeasonStats.objects.count() == 1 assert PlayerSeasonStats.objects.count() == 1
@ -187,3 +195,59 @@ def test_same_run_second_file_same_checksum_is_skipped(tmp_path, settings):
assert files["a.json"].status == ImportFile.FileStatus.SUCCESS assert files["a.json"].status == ImportFile.FileStatus.SUCCESS
assert files["b.json"].status == ImportFile.FileStatus.SKIPPED assert files["b.json"].status == ImportFile.FileStatus.SKIPPED
assert files["a.json"].checksum == files["b.json"].checksum assert files["a.json"].checksum == files["b.json"].checksum
@pytest.mark.django_db
def test_same_raw_external_ids_from_different_sources_do_not_collide(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
lba_payload = _valid_payload_for_source("lba", competition_name="Lega Basket Serie A", team_name="Virtus Bologna")
bcl_payload = _valid_payload_for_source("bcl", competition_name="Basketball Champions League", team_name="AEK Athens")
_write_json(incoming / "lba.json", lba_payload)
_write_json(incoming / "bcl.json", bcl_payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
assert Competition.objects.filter(source_uid="comp-nba").count() == 2
assert Team.objects.filter(source_uid="team-lal").count() == 2
assert Player.objects.filter(source_uid="player-23").count() == 2
assert Competition.objects.filter(source_name="lba", source_uid="comp-nba", name="Lega Basket Serie A").exists()
assert Competition.objects.filter(source_name="bcl", source_uid="comp-nba", name="Basketball Champions League").exists()
assert Team.objects.filter(source_name="lba", source_uid="team-lal", name="Virtus Bologna").exists()
assert Team.objects.filter(source_name="bcl", source_uid="team-lal", name="AEK Athens").exists()
@pytest.mark.django_db
def test_reimport_same_source_payload_remains_idempotent(tmp_path, settings):
incoming = tmp_path / "incoming"
archive = tmp_path / "archive"
failed = tmp_path / "failed"
incoming.mkdir()
archive.mkdir()
failed.mkdir()
payload = _valid_payload_for_source("lba")
_write_json(incoming / "lba-1.json", payload)
settings.STATIC_DATASET_INCOMING_DIR = str(incoming)
settings.STATIC_DATASET_ARCHIVE_DIR = str(archive)
settings.STATIC_DATASET_FAILED_DIR = str(failed)
call_command("import_snapshots")
_write_json(incoming / "lba-2.json", payload)
call_command("import_snapshots")
assert Competition.objects.filter(source_name="lba", source_uid="comp-nba").count() == 1
assert Team.objects.filter(source_name="lba", source_uid="team-lal").count() == 1
assert Player.objects.filter(source_name="lba", source_uid="player-23").count() == 1

View File

@ -8,59 +8,31 @@ from apps.competitions.models import Competition, Season
from apps.ingestion.models import ImportFile, ImportRun from apps.ingestion.models import ImportFile, ImportRun
from apps.players.models import Nationality, Player, Position, Role from apps.players.models import Nationality, Player, Position, Role
from apps.scouting.models import FavoritePlayer, SavedSearch from apps.scouting.models import FavoritePlayer, SavedSearch
from apps.stats.models import PlayerSeason
from apps.teams.models import Team from apps.teams.models import Team
@pytest.mark.django_db @pytest.mark.django_db
def test_player_unique_full_name_birth_date_constraint(): def test_source_uid_uniqueness_is_scoped_by_source_name():
nationality = Nationality.objects.create(name="Italy", iso2_code="IT", iso3_code="ITA") Season.objects.create(
position = Position.objects.create(code="PG", name="Point Guard")
role = Role.objects.create(code="playmaker", name="Playmaker")
Player.objects.create(
first_name="Marco",
last_name="Rossi",
full_name="Marco Rossi",
birth_date=date(2001, 1, 1),
nationality=nationality,
nominal_position=position,
inferred_role=role,
source_uid="player-src-1",
)
with pytest.raises(IntegrityError):
Player.objects.create(
first_name="Marco",
last_name="Rossi",
full_name="Marco Rossi",
birth_date=date(2001, 1, 1),
nationality=nationality,
nominal_position=position,
inferred_role=role,
)
@pytest.mark.django_db
def test_source_uid_uniqueness_on_core_entities():
season = Season.objects.create(
source_uid="season-2024", source_uid="season-2024",
label="2024-2025", label="2024-2025",
start_date=date(2024, 10, 1), start_date=date(2024, 10, 1),
end_date=date(2025, 6, 30), end_date=date(2025, 6, 30),
) )
competition = Competition.objects.create( Competition.objects.create(
source_name="lba",
source_uid="comp-001", source_uid="comp-001",
name="Serie A", name="Serie A",
slug="serie-a", slug="serie-a",
competition_type=Competition.CompetitionType.LEAGUE, competition_type=Competition.CompetitionType.LEAGUE,
) )
team = Team.objects.create(source_uid="team-001", name="Virtus Bologna", slug="virtus-bologna") Team.objects.create(source_name="lba", source_uid="team-001", name="Virtus Bologna", slug="virtus-bologna")
nationality = Nationality.objects.create(name="Spain", iso2_code="ES", iso3_code="ESP") nationality = Nationality.objects.create(name="Spain", iso2_code="ES", iso3_code="ESP")
position = Position.objects.create(code="SF", name="Small Forward") position = Position.objects.create(code="SF", name="Small Forward")
role = Role.objects.create(code="wing", name="Wing") role = Role.objects.create(code="wing", name="Wing")
player = Player.objects.create( Player.objects.create(
source_name="lba",
source_uid="player-001", source_uid="player-001",
first_name="Juan", first_name="Juan",
last_name="Perez", last_name="Perez",
@ -71,16 +43,32 @@ def test_source_uid_uniqueness_on_core_entities():
inferred_role=role, inferred_role=role,
) )
PlayerSeason.objects.create( Competition.objects.create(
source_uid="ps-001", source_name="bcl",
player=player, source_uid="comp-001",
season=season, name="BCL",
team=team, slug="bcl",
competition=competition, competition_type=Competition.CompetitionType.INTERNATIONAL,
)
Team.objects.create(source_name="bcl", source_uid="team-001", name="AEK", slug="aek")
Player.objects.create(
source_name="bcl",
source_uid="player-001",
first_name="Juan",
last_name="Perez",
full_name="Juan Perez",
birth_date=date(2000, 5, 1),
nationality=nationality,
nominal_position=position,
inferred_role=role,
) )
assert Competition.objects.filter(source_uid="comp-001").count() == 2
assert Team.objects.filter(source_uid="team-001").count() == 2
assert Player.objects.filter(source_uid="player-001").count() == 2
with pytest.raises(IntegrityError): with pytest.raises(IntegrityError):
Team.objects.create(source_uid="team-001", name="Another Team", slug="another-team") Team.objects.create(source_name="lba", source_uid="team-001", name="Another Team", slug="another-team")
@pytest.mark.django_db @pytest.mark.django_db