ged = Gedcom(file, tree)
- # Deduplicate names by string representation
+ # Deduplicate names by string representation (deterministic: first alphabetically wins)
def merge_names(target_set, source_set):
- existing_names = {str(n) for n in target_set}
- for n in source_set:
- if str(n) not in existing_names:
+ # Combine all names and sort deterministically
+ all_names = list(target_set) + list(source_set)
+ all_names.sort(key=lambda x: (
+ str(x),
+ x.given or "",
+ x.surname or "",
+ x.prefix or "",
+ x.suffix or "",
+ x.kind or "",
+ str(x.alternative) if hasattr(x, 'alternative') else "",
+ x.note.text if hasattr(x, 'note') and x.note else "",
+ ))
+ # Rebuild target_set keeping first occurrence by string
+ target_set.clear()
+ seen = set()
+ for n in all_names:
+ s = str(n)
+ if s not in seen:
target_set.add(n)
- existing_names.add(str(n))
+ seen.add(s)
# Helper for whitespace normalization in quotes
def norm_space(s):
# and should be updated whenever fixtures are regenerated.
export EXPECTED_ADA_LINES=11587
export EXPECTED_MARIE_LINES=3698
-export EXPECTED_MERGED_LINES=14481
+export EXPECTED_MERGED_LINES=14483
else:
print(f"✓ Marie Curie (Part 2) lines verified ({l_part2}).")
- if l_merged != exp_merged:
- print(f"❌ Assertion Failed: Merged line count {l_merged} != {exp_merged}")
+ # Check merged file with exact diff (no line count tolerance)
+ diff_result = subprocess.run(
+ ["git", "diff", "--no-index", "--exit-code", "--color=always", str(merged), str(ARTIFACTS_DIR / "merged_scientists.ged")],
+ )
+ if diff_result.returncode != 0:
+ print(f"❌ Merged file differs from artifact (see diff above)")
failed = True
else:
- print(f"✓ Merged lines verified ({l_merged}).")
+ print(f"✓ Merged file matches artifact exactly ({l_merged} lines).")
if failed:
sys.exit(1)