Merge pull request #8 from mmolari/marco-dev

mmolari · web-flow · commit 113604e2accd · 2022-11-24T18:10:23.000+01:00
fix: max insertion frequency
diff --git a/notes/improvements.md b/notes/improvements.md
@@ -3,6 +3,7 @@
 - [ ] Rename records in original fasta/genbank assembled genome as `vialXX_timeYY_clZZ`?
 - [x] Group together correlated deletion trajectories that span adjacent intervals.
 - [ ] parallelize insertion fisher test evaluation.
+- [ ] Because of quality filtering, the number of insertions at a site might be greater than the number of reads in the pileup. As a workaround, in the insertion analysis we decrease the number of insertions in this case, so that the insertion frequency is never greater than one. However, this might bias the estimate of the insertion frequency at the rest of the sites. A proper solution would also require keeping information on the unfiltered number of reads (i.e. an unfiltered pileup).
 - [x] Select trajectories based on delta (max - min) frequency on timepoints with high confidence.
 - [x] Use secondary/supplementary reads to find duplicated/chimeric region bridges.
 - [ ] Make the pipeline less reliant on folder structure. Pass the files directly in channels.
diff --git a/scripts/plot_insertions.py b/scripts/plot_insertions.py
@@ -54,22 +54,26 @@ def L_tot(x):
         df["If"], df["Ir"] = I[:, 0], I[:, 1]
         df["It"] = I.sum(axis=1)
 
+        # average read length
+        Ltot = np.vstack([L_tot(ins[p]) for p in pos])
+        df["Lf"] = safe_division(Ltot[:, 0], df["If"])
+        df["Lr"] = safe_division(Ltot[:, 1], df["Ir"])
+        df["Lt"] = Ltot.sum(axis=1) / df["It"]
+
         # number of reads
         df["Nf"] = stats_table.N(t, kind="fwd")[pos]
         df["Nr"] = stats_table.N(t, kind="rev")[pos]
         df["Nt"] = df["Nf"] + df["Nr"]
 
+        # cap insertion counts at the read counts: because of quality filtering
+        # the insertion count can exceed the number of reads at some positions
+        df["If"] = np.minimum(df["If"],df["Nf"])
+        df["Ir"] = np.minimum(df["Ir"],df["Nr"])
+
         # frequency of insertions
         df["Ff"] = safe_division(df["If"], df["Nf"])
         df["Fr"] = safe_division(df["Ir"], df["Nr"])
         df["Ft"] = safe_division(df["It"], df["Nt"])
 
-        # average read length
-        Ltot = np.vstack([L_tot(ins[p]) for p in pos])
-        df["Lf"] = safe_division(Ltot[:, 0], df["If"])
-        df["Lr"] = safe_division(Ltot[:, 1], df["Ir"])
-        df["Lt"] = Ltot.sum(axis=1) / df["It"]
-
         # build dataframe
         dfs[t] = pd.DataFrame(df, index=pos)
     #     df = pd.concat(columns, axis=1).fillna(0).astype(int)