Daylily-Informatics · iamh2o · Sep 9, 2025 · Copilot · Sep 9, 2025 · Copilot
diff --git a/config/relatedness.yaml b/config/relatedness.yaml
@@ -0,0 +1,45 @@
+# REQUIRED: reference + resources
+ref_fasta: /path/to/GRCh38.fa
+
+somalier:
+  sites_vcf: /path/to/somalier-sites-hg38.vcf.bgz   # common 0.5-MAF SNP panel (bgzipped + .tbi)
+  # If you only have BAM/CRAMs, Somalier will extract from BAM with --fasta {ref_fasta}
+  # If you have VCFs with germline GTs, Somalier will extract from VCF.
+  genome_build: GRCh38
+
+picard:
+  # HAPLOTYPE_MAP from GATK resource bundle (haplotype map for fingerprinting)
+  haplotype_map: /path/to/haplotype_map_hg38.vcf.gz
+
+conpair:
+  # Conpair site lists (use hg38 bundle)
+  snp_positions_bed: /path/to/Conpair/snp_positions_hg38.bed
+  common_sites_vcf: /path/to/Conpair/common_snps_hg38.vcf.gz
+  min_baseq: 20
+  min_mapq: 10
+
+# INPUTS (either BAM/CRAM or VCF per sample; you can mix)
+samples:
+  # sample_id: {bam: "..."}  OR  {vcf: "..."}
+  HG001_N1: {bam: /data/HG001/normal1.bam}
+  HG001_N2: {bam: /data/HG001/normal2.bam}
+  PT1_T:    {bam: /data/PT1/tumor.bam}
+  PT1_N:    {bam: /data/PT1/normal.bam}
+  FATHER:   {vcf: /data/family/joint.vcf.gz}   # joint or singleton VCF allowed
+  MOTHER:   {vcf: /data/family/joint.vcf.gz}
+  CHILD:    {vcf: /data/family/joint.vcf.gz}
+
+# EXPECTED RELATIONSHIPS (used by final report for assertions)
+# relationship ∈ {identical, duplicate, tumor_normal, parent_child, siblings, unrelated}
+expected:
+  - {relationship: identical,     samples: [HG001_N1, HG001_N2]}
+  - {relationship: tumor_normal,  samples: [PT1_T, PT1_N]}
+  - {relationship: parent_child,  samples: [FATHER, CHILD]}
+  - {relationship: parent_child,  samples: [MOTHER, CHILD]}
+  - {relationship: siblings,      samples: [FATHER, MOTHER], note: "should NOT be siblings -> expect fail"}  # example negative check
+
+# OPTIONAL: run peddy on a joint VCF (pedigree checks)
+peddy:
+  enabled: false
+  joint_vcf: /data/family/joint.vcf.gz
+  ped: /data/family/family.ped
diff --git a/workflow/envs/conpair_v0.1.yaml b/workflow/envs/conpair_v0.1.yaml
@@ -0,0 +1,11 @@
+name: conpair
+channels: [conda-forge, bioconda]
+dependencies:
+  - python=3.9
+  - samtools
+  - pysam
+  - pandas
+  - numpy
+  - pip
+  - pip:
+      - git+https://github.com/nygenome/Conpair.git
diff --git a/workflow/envs/peddy_relatedness_v0.1.yaml b/workflow/envs/peddy_relatedness_v0.1.yaml
@@ -0,0 +1,7 @@
+name: peddy
+channels: [conda-forge, bioconda]
+dependencies:
+  - peddy
+  - cython
+  - pandas
+  - python>=3.9
diff --git a/workflow/envs/picard_relatedness_v0.1.yaml b/workflow/envs/picard_relatedness_v0.1.yaml
@@ -0,0 +1,5 @@
+name: picard
+channels: [conda-forge, bioconda]
+dependencies:
+  - picard=3.*
+  - openjdk=17
diff --git a/workflow/envs/relatedness_report_v0.1.yaml b/workflow/envs/relatedness_report_v0.1.yaml
@@ -0,0 +1,8 @@
+name: relatedness-report
+channels: [conda-forge, bioconda]
+dependencies:
+  - python>=3.9
+  - pandas
+  - numpy
+  - jinja2
+  - pyyaml
diff --git a/workflow/envs/somalier_v0.1.yaml b/workflow/envs/somalier_v0.1.yaml
@@ -0,0 +1,8 @@
+name: somalier
+channels: [conda-forge, bioconda]
+dependencies:
+  - somalier=0.2*
+  - htslib
+  - bcftools
+  - samtools
+  - python>=3.9
diff --git a/workflow/rules/relatedness_test_day.smk b/workflow/rules/relatedness_test_day.smk
@@ -199,51 +199,24 @@ rule conpair_compare:
             cp results/conpair/{wildcards.t}__{wildcards.n}/res_concordance_details.txt {output.conctsv}
         fi
         """
-
-
-
-
 #######################################################################
 # PEDDY (optional; requires a joint VCF)
 #######################################################################
-rule conpair_mpileup_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/tumor.mpileup",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/normal.mpileup",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
 
-rule conpair_parse_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/tumor.parsed",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/normal.parsed",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
+use rule conpair_mpileup as conpair_mpileup_all with:
+    wildcards:
+        t=[p[0] for p in tn_pairs()],
+        n=[p[1] for p in tn_pairs()]
 
-rule conpair_compare_all:
-    input:
-        expand(
-            "results/conpair/{t}__{n}/concordance.tsv",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        ),
-        expand(
-            "results/conpair/{t}__{n}/summary.txt",
-            t=[p[0] for p in tn_pairs()],
-            n=[p[1] for p in tn_pairs()]
-        )
+use rule conpair_parse as conpair_parse_all with:
+    wildcards:
+        t=[p[0] for p in tn_pairs()],
+        n=[p[1] for p in tn_pairs()]
+
+use rule conpair_compare as conpair_compare_all with:
+    wildcards:
+        t=[p[0] for p in tn_pairs()],
+        n=[p[1] for p in tn_pairs()]
 
 rule peddy_relatedness:
     input:

diff --git a/workflow/scripts/relatedness_report.py b/workflow/scripts/relatedness_report.py
@@ -0,0 +1,203 @@
+import sys, os, yaml, pandas as pd, numpy as np
+from jinja2 import Template
+
+som_pairs = snakemake.input.som_pairs
+som_groups = snakemake.input.som_groups
+picard_metrics = snakemake.input.picard_metrics
+picard_matrix = snakemake.input.picard_matrix
+conpair_files = snakemake.input.get("conpair", [])
+out_tsv = snakemake.output.tsv
+out_html = snakemake.output.html
+cfg_path = snakemake.params.cfg
+
+with open(cfg_path) as fh:
+    CFG = yaml.safe_load(fh)
+
+# ---------------- Somalier ----------------
+sp = pd.read_csv(som_pairs, sep="\t")
+# Ensure consistent column presence
+# expected columns include: sample_a, sample_b, relatedness, ibs0, ibs2, hom_concordance, het_concordance, etc.
+sp_cols = {c.lower(): c for c in sp.columns}
+def col(name): return sp_cols.get(name, name)
+
+# Fast lookup for any pair (unordered)
+def key(a,b): return tuple(sorted([a,b]))
+sp["pair_key"] = [key(a,b) for a,b in zip(sp[col("sample_a")], sp[col("sample_b")])]
+sp_pairs = {k: row for k, row in sp.set_index("pair_key").iterrows()}
+
+# ---------------- Picard Crosscheck ----------------
+# metrics.txt has columns like: LEFT_FILE, RIGHT_FILE, RESULT, LOD_SCORE, etc.
+pm = pd.read_csv(picard_metrics, sep="\t", comment="#")
+# map file path -> sample name from config
+path2sample = {}
+for s,ent in CFG["samples"].items():
+    p = ent.get("bam") or ent.get("vcf")
+    if p: path2sample[os.path.abspath(p)] = s
+
+def file2sample(p):
+    a = os.path.abspath(p)
+    # Picard sometimes normalizes paths; try basename fallback
+    if a in path2sample: return path2sample[a]
+    base = os.path.basename(a)
+    for k,v in path2sample.items():
+        if os.path.basename(k)==base: return v
+    return base
+
+pm["left_samp"]  = pm["LEFT_FILE"].map(file2sample)
+pm["right_samp"] = pm["RIGHT_FILE"].map(file2sample)
+pm["pair_key"] = [key(a,b) for a,b in zip(pm["left_samp"], pm["right_samp"]) ]
+
+# Collapse to best (max absolute LOD) per pair
+pm_best = pm.loc[pm.groupby("pair_key")["LOD_SCORE"].apply(lambda s: s.abs().idxmax())]
+
+picard_pairs = {k: row for k,row in pm_best.set_index("pair_key").iterrows()}
+
+# ---------------- Conpair (tumor/normal) ----------------
+con_df = []
+for f in conpair_files:
+    if not os.path.exists(f): continue
+    try:
+        df = pd.read_csv(f, sep="\t")
+    except Exception:
+        # Some versions write CSV-like; be lenient
+        df = pd.read_csv(f)
+    df.columns = [c.lower() for c in df.columns]
-    df.columns = [c.lower() for c in df.columns]
-    df.columns = [c.lower() for c in df.columns]
+    # Try to capture pair names from path
+    pair = os.path.basename(os.path.dirname(f)).split("__")
+    df["sample_a"] = pair[0]
+    df["sample_b"] = pair[1]
-    df["sample_a"] = pair[0]
-    df["sample_b"] = pair[1]
+    if len(pair) >= 2:
+        df["sample_a"] = pair[0]
+        df["sample_b"] = pair[1]
+    else:
+        # Fallback: assign the whole name to sample_a, empty string to sample_b
+        df["sample_a"] = pair[0]
+        df["sample_b"] = ""
-    df["sample_a"] = pair[0]
-    df["sample_b"] = pair[1]
+    if len(pair) >= 2:
+        df["sample_a"] = pair[0]
+        df["sample_b"] = pair[1]
+    else:
+        # Fallback: assign the whole name to sample_a, empty string to sample_b
+        df["sample_a"] = pair[0]
+        df["sample_b"] = ""
+    con_df.append(df)
+con_df = pd.concat(con_df, ignore_index=True) if con_df else pd.DataFrame()
+
+# Heuristic thresholds
+THRESH = dict(
+    identical_relatedness=0.95,      # Somalier ~1.0 for same-individual/duplicate
+    tn_relatedness=0.70,             # Tumor-normal often 0.8–1.0; allow LOH/purity wiggle
+    first_degree_relatedness=0.45,   # Parent-child or full-sibs ~0.45–0.55 in Somalier
+    ibs0_parent_child=0,             # IBS0 = 0 for PO in ideal; allow <= 2 by noise
+    picard_mismatch_lod=-5.0,        # strong negative indicates mismatch
+    picard_match_lod=5.0             # strong positive indicates match
+)
+
+def get_somalier(a,b, field):
+    r = sp_pairs.get(key(a,b))
+    return None if r is None else r.get(field, r.get(field.upper()))
+
+def get_picard(a,b):
+    r = picard_pairs.get(key(a,b))
+    if r is None: return None
+    return dict(result=r.get("RESULT",""), lod=float(r.get("LOD_SCORE", np.nan)))
+
+# Evaluate expectations
+rows = []
+for exp in CFG.get("expected", []):
+    rel = exp["relationship"]
+    a, b = exp["samples"]
+    note = exp.get("note","")
+
+    som_rel = get_somalier(a,b, "relatedness")
+    ibs0 = get_somalier(a,b, "ibs0")
+    pic = get_picard(a,b)
+
+    status = "PASS"
+    fail_reasons = []
+
+    if rel in ("identical","duplicate"):
+        if som_rel is not None and som_rel < THRESH["identical_relatedness"]:
+            status = "FAIL"; fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['identical_relatedness']}")
+        if pic is not None and pic["lod"] < THRESH["picard_match_lod"]:
+            status = "FAIL"; fail_reasons.append(f"Picard.LOD={pic['lod']:.1f} < {THRESH['picard_match_lod']}")
+    elif rel == "tumor_normal":
+        if som_rel is not None and som_rel < THRESH["tn_relatedness"]:
+            status = "FAIL"; fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['tn_relatedness']}")
+        if pic is not None and pic["lod"] < 0 and pic["lod"] <= THRESH["picard_mismatch_lod"]:
-        if pic is not None and pic["lod"] < 0 and pic["lod"] <= THRESH["picard_mismatch_lod"]:
+        if pic is not None and pic["lod"] <= THRESH["picard_mismatch_lod"]:
-        if pic is not None and pic["lod"] < 0 and pic["lod"] <= THRESH["picard_mismatch_lod"]:
+        if pic is not None and pic["lod"] <= THRESH["picard_mismatch_lod"]:
+            status = "FAIL"; fail_reasons.append(f"Picard.LOD strongly negative ({pic['lod']:.1f})")
+    elif rel in ("parent_child","parent-offspring"):
+        # Expect first-degree + IBS0≈0
+        if som_rel is not None and som_rel < THRESH["first_degree_relatedness"]:
+            status = "FAIL"; fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['first_degree_relatedness']}")
+        if ibs0 is not None and ibs0 > 2:
+            status = "FAIL"; fail_reasons.append(f"Somalier.IBS0={ibs0} > 2 (expected ~0)")
+    elif rel == "siblings":
+        # Expect first-degree but IBS0>0 typically
+        if som_rel is not None and som_rel < THRESH["first_degree_relatedness"]:
+            status = "FAIL"; fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} < {THRESH['first_degree_relatedness']}")
+        if ibs0 is not None and ibs0 <= 0:
+            status = "FAIL"; fail_reasons.append(f"Somalier.IBS0={ibs0} <= 0 (sibs typically >0)")
+    elif rel == "unrelated":
+        if som_rel is not None and som_rel >= 0.2:
+            status = "FAIL"; fail_reasons.append(f"Somalier.relatedness={som_rel:.3f} unexpectedly high for unrelated")
+        if pic is not None and pic["lod"] >= THRESH["picard_match_lod"]:
+            status = "FAIL"; fail_reasons.append(f"Picard.LOD={pic['lod']:.1f} unexpectedly high for unrelated")
+    else:
+        fail_reasons.append(f"Unknown relationship: {rel}")
+
+    rows.append(dict(
+        sample_a=a, sample_b=b, relationship=rel, note=note,
+        somalier_relatedness=som_rel,
+        somalier_ibs0=ibs0,
+        picard_result=None if pic is None else pic["result"],
+        picard_lod=None if pic is None else pic["lod"],
+        status=status, reason="; ".join(fail_reasons) if fail_reasons else ""
+    ))
+
+summary = pd.DataFrame(rows).sort_values(["status","relationship","sample_a","sample_b"])
+os.makedirs(os.path.dirname(out_tsv), exist_ok=True)
+summary.to_csv(out_tsv, sep="\t", index=False)
+
+# Minimal HTML (table + quick hints)
+tpl = Template("""
+<!doctype html><html><head>
+<meta charset="utf-8"><title>Relatedness QC</title>
+<style>body{font-family:system-ui,Arial} table{border-collapse:collapse} td,th{border:1px solid #ccc;padding:6px 8px} .FAIL{background:#ffe3e3} .PASS{background:#e6ffed}</style>
+</head><body>
+<h1>Relatedness QC</h1>
+<p><b>Samples:</b> {{ ns }} | <b>Somalier pairs:</b> {{ n_pairs }} | <b>Picard pairs:</b> {{ n_picard }}</p>
+<h2>Expectations</h2>
+<table>
+  <thead><tr>
+    <th>Status</th><th>Relationship</th><th>Sample A</th><th>Sample B</th>
+    <th>Somalier.relatedness</th><th>Somalier.IBS0</th><th>Picard.LOD</th><th>Notes / Reasons</th>
+  </tr></thead>
+  <tbody>
+  {% for r in rows %}
+  <tr class="{{ r.status }}">
+    <td>{{ r.status }}</td>
+    <td>{{ r.relationship }}</td>
+    <td>{{ r.sample_a }}</td>
+    <td>{{ r.sample_b }}</td>
+    <td>{{ "%.3f"|format(r.somalier_relatedness) if r.somalier_relatedness is not none else "" }}</td>
+    <td>{{ r.somalier_ibs0 if r.somalier_ibs0 is not none else "" }}</td>
+    <td>{{ "%.1f"|format(r.picard_lod) if r.picard_lod is not none else "" }}</td>
+    <td>{{ r.reason or r.note }}</td>
+  </tr>
+  {% endfor %}
+  </tbody>
+</table>
+
+<h2>Files</h2>
+<ul>
+  <li>Somalier: <code>results/somalier/cohort_pairs.tsv</code>, <code>cohort_groups.tsv</code>, <code>cohort.html</code></li>
+  <li>Picard: <code>results/picard/crosscheck/metrics.txt</code>, <code>matrix.txt</code></li>
+  {% if has_conpair %}<li>Conpair per TN pair: <code>results/conpair/&lt;T&gt;__&lt;N&gt;/</code></li>{% endif %}
+  {% if peddy_enabled %}<li>Peddy: <code>results/peddy/peddy.html</code></li>{% endif %}
+</ul>
+
+<p style="margin-top:24px;font-size:90%"><b>Thresholds (heuristic defaults):</b>
+identical ≥ {{th.identical_relatedness}}, tumor-normal ≥ {{th.tn_relatedness}},
+first-degree ≥ {{th.first_degree_relatedness}},
+Picard match LOD ≥ {{th.picard_match_lod}}, strong mismatch LOD ≤ {{th.picard_mismatch_lod}}.
+</p>
+</body></html>
+""")
+html = tpl.render(
+    rows=rows,
+    ns=len(CFG["samples"]),
+    n_pairs=len(sp),
+    n_picard=len(pm),
+    has_conpair=len(conpair_files)>0,
+    peddy_enabled=CFG.get("peddy",{}).get("enabled", False),
+    th=THRESH
+)
+with open(out_html,"w") as f:
+    f.write(html)