Skip to content

Commit

Permalink
All unclustered/unprocessed sequences output
Browse files Browse the repository at this point in the history
  • Loading branch information
rsuchecki committed Aug 19, 2021
1 parent ffb7eef commit 436873b
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 46 deletions.
2 changes: 1 addition & 1 deletion build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

<target name="-pre-init">
<!-- <property name="project.name" value="NAME" />-->
<property name="version.num" value="0.9.4" />
<property name="version.num" value="0.9.5" />
<tstamp>
<format property="NOW" pattern="yyyy-MM-dd HH:mm:ss z" />
</tstamp>
Expand Down
4 changes: 2 additions & 2 deletions src/vsearchprocess/ClusteredSampleMSA.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public class ClusteredSampleMSA {
private final ArrayList<MsaSequence> sequences;
private final String sampleId;
private ArrayList<Snp> snpsWithin;

public ClusteredSampleMSA(String sample) {
sequences = new ArrayList<>();
this.sampleId = sample;
Expand All @@ -37,7 +37,7 @@ public ClusteredSampleMSA(String sample) {
/**
 * Reports how many sequences are currently stored in {@code sequences}.
 *
 * @return the size of the {@code sequences} list
 */
public int size() {
    final int sequenceCount = this.sequences.size();
    return sequenceCount;
}

/**
 * Accessor for the sample identifier held by this object.
 *
 * @return the value of the {@code sampleId} field
 */
public String getSampleId() {
    return this.sampleId;
}
Expand Down
32 changes: 17 additions & 15 deletions src/vsearchprocess/ClusteredSequencesMSA.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,17 @@
*/
public class ClusteredSequencesMSA {

// HashMap<String, ArrayList<Sequence>> map;
private HashMap<String, ClusteredSampleMSA> map;
private Integer msaAlignmentLength;
private Integer msaAlignmentLength; //All sequences' MSA length must match
private ArrayList<Snp> intraSnps = new ArrayList<>(0);
private ArrayList<Snp> interSnps = new ArrayList<>(0);
private HashMap<String, MsaSeqPair> seqPairs = new HashMap<>();

// ArrayList<Sequence> list;
private final String TOOL_NAME;

public ClusteredSequencesMSA(ArrayList<String> sampleNames, String TOOL_NAME) {
this.TOOL_NAME = TOOL_NAME;
map = new HashMap<>(sampleNames.size() * 2);
// list = new ArrayList<>();
for (String sampleName : sampleNames) {
map.put(sampleName, new ClusteredSampleMSA(sampleName));
}
Expand Down Expand Up @@ -85,7 +82,6 @@ public ArrayList<Snp> getInterSnps() {
return interSnps;
}

//
public int size() {
int size = 0;
for (ClusteredSampleMSA s : map.values()) {
Expand All @@ -99,7 +95,6 @@ public Integer getMsaAlignmentLength() {
}

public void addSequence(MsaSequence sequence) {
// list.add(sequence);
if (msaAlignmentLength == null) {
msaAlignmentLength = sequence.getLength();
} else if (msaAlignmentLength != sequence.getLength()) {
Expand Down Expand Up @@ -244,23 +239,30 @@ public ArrayList<Double> getPairwiseIntraIdenities() {
// return sb;
// }

/**
 * Renders the sequences returned by {@code getSequencesList()} as text records, each made of
 * a header line starting with {@code >} followed by a sequence line.
 *
 * NOTE(review): this span is a rendered diff, so the removed two-argument signature and its
 * body appear next to the added three-argument version. The description below refers to the
 * added (three-argument) lines.
 *
 * @param clusterNumber   when non-null, each header is written as
 *                        {@code >Cluster_} + clusterNumber + {@code _} + sequence id;
 *                        when null, the header is just {@code >} + sequence id
 * @param suppressPadding when true, every {@code -} is stripped from the sequence string and
 *                        the length filter uses {@code getLengthUnpadded()}; when false, the
 *                        sequence string is written unchanged and the filter uses
 *                        {@code getLength()}
 * @param minLength       sequences whose relevant length is below this value are omitted;
 *                        the value is auto-unboxed in the comparison, so it must not be null
 * @return the accumulated {@code StringBuilder}; empty if no sequence passes the length filter
 */
public CharSequence getClusterForPrint(int clusterNumber, boolean suppressPadding) {
public CharSequence getClusterForPrint(Integer clusterNumber, boolean suppressPadding, Integer minLength) {
// System.out.println("\nCLUSTER: " + clusterLabel);
StringBuilder sb = new StringBuilder();
for (Sequence msaSequence : getSequencesList()) {
sb.append(">Cluster_").append(clusterNumber).append("_").append(msaSequence.getId());
sb.append(System.lineSeparator());
if (suppressPadding) {
sb.append(msaSequence.getSequenceString().replaceAll("-", "")).append(System.lineSeparator());
} else {
sb.append(msaSequence.getSequenceString()).append(System.lineSeparator());
// Length filter: compare the unpadded length when padding is suppressed, the full length otherwise.
if ((suppressPadding && msaSequence.getLengthUnpadded() >= minLength) || (!suppressPadding && msaSequence.getLength() >= minLength)) {
// A null cluster number yields a plain ">" header without the "Cluster_<n>_" prefix.
if (clusterNumber != null) {
sb.append(">Cluster_").append(clusterNumber).append("_");
} else {
sb.append(">");
}
sb.append(msaSequence.getId());
sb.append(System.lineSeparator());
if (suppressPadding) {
sb.append(msaSequence.getSequenceString().replaceAll("-", "")).append(System.lineSeparator());
} else {
sb.append(msaSequence.getSequenceString()).append(System.lineSeparator());
}
}
}
return sb;
}

public void printIntraSnps(int clusterNumber, boolean reverseLex, String DELIMITER,
String suffix, boolean printSequence) {
String suffix, boolean printSequence) {
StringBuilder sb = new StringBuilder();
for (Snp snp : intraSnps) {
sb.append(snp.getSnpString(clusterNumber, reverseLex, DELIMITER, suffix));
Expand All @@ -274,7 +276,7 @@ public void printIntraSnps(int clusterNumber, boolean reverseLex, String DELIMIT
}

public void printInterSnps(int clusterNumber, boolean reverseLex, String DELIMITER, String suffix,
double minInterIdentity, boolean printSequence) {
double minInterIdentity, boolean printSequence) {
StringBuilder sb = new StringBuilder();
for (Snp snp : interSnps) {
MsaSeqPair pair = seqPairs.get(snp.getSequence1().getId() + snp.getSequence2().getId());
Expand Down
57 changes: 29 additions & 28 deletions src/vsearchprocess/VsearchClustersCaller.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ private OptSet populateOptSet() {
optSet.addOpt(new Opt(null, "min-samples-clustered", "Minimum number of samples in an input cluster", 1).setMinValue(1).setDefaultValue(2));
optSet.addOpt(new Opt(null, "min-seqs-clustered-in", "Minimum number of sequences in an input cluster", 1).setMinValue(2).setDefaultValue(2));
optSet.addOpt(new Opt(null, "max-seqs-clustered-in", "Maximum number of sequences in an input cluster", 1).setMinValue(2).setDefaultValue(1000));
optSet.addOpt(new Opt(null, "no-merge", "Do not merge/assemble perfectly matching sequenceswithin a sample. By default, non-conflicting sequences are merged/assembled within each sample."));
optSet.incrementLisitngGroup();
optSet.setListingGroupLabel("[Variant calling and reporting]");
// optSet.addOpt(new Opt(null, "min-sequences-per-cluster", "Minimum number of sequences required for a cluster to be considered ", 1).setMinValue(2).setDefaultValue(2));
Expand All @@ -118,7 +119,8 @@ private OptSet populateOptSet() {
//// optSet.addOpt(new Opt('q', "out-queue-capacity", "Maximum number of buffers put on queue for writing-out",64, 1, 256));
optSet.addOpt(new Opt(null, "out-clusters-msa", "Output clustered sequences (for which SNPs were called) to <arg> MSA/FASTA file", 1));
optSet.addOpt(new Opt(null, "out-clusters-fasta", "Output clustered sequences (for which SNPs were called) to <arg> FASTA file", 1));
optSet.addOpt(new Opt(null, "out-unclustered-fasta", "Output unclustered sequences to <arg> FASTA file", 1));
optSet.addOpt(new Opt(null, "out-unclustered-fasta", "Output unclustered sequences to <arg> FASTA file", 1)
.addFootnote(1, "The sequences may have been clustered by VSEARCH but did not fulfill variant calling and reporting thresholds."));
optSet.addOpt(new Opt(null, "out-unclustered-min-len", "Minimum length required to output an unclustered sequence", 1).setMinValue(1).setDefaultValue(100));
optSet.addOpt(new Opt(null, "out-unvarying-clusters", "Output clusters (FASTA and/or MSA) for which no SNPs/indels called. Not reported by default"));
optSet.addOpt(new Opt('o', "stdout-redirect", "Redirect stdout to this file", 1));
Expand Down Expand Up @@ -224,15 +226,8 @@ public void readAndProcessMSASequencesFromFasta(String fileName, OptSet optSet)
} else if (line.equals(">consensus")) {
//PROCESS PREVIOUS CLUSTER
// if (sequencesList.size() > 1) {
clusterNumber = processCluster(clusteredSeqs, optSet, clusterNumber, clustersFastaOut, clustersMsaOut);
clusterNumber = processCluster(clusteredSeqs, optSet, clusterNumber, clustersFastaOut, clustersMsaOut, unclusteredFastaOut, unclusteredOutMinLength);

if (unclusteredFastaOut != null && clusteredSeqs.size() == 1) {
MsaSequence seq = clusteredSeqs.getSequencesList().get(0);
if (seq.getUnpaddedLength() >= unclusteredOutMinLength) {
unclusteredFastaOut.write(clusteredSeqs.getSequencesList().get(0).getFasta(true).toString());
unclusteredFastaOut.newLine();
}
}
// }
//SKIP the consensus
// continue;
Expand Down Expand Up @@ -282,7 +277,7 @@ public void readAndProcessMSASequencesFromFasta(String fileName, OptSet optSet)
}

private int processCluster(ClusteredSequencesMSA clusteredSeqs, OptSet optSet, int clusterNumber, BufferedWriter clustersFastaOut,
BufferedWriter clustersMsaOut) throws IOException {
BufferedWriter clustersMsaOut, BufferedWriter unclusteredFastaOut, int unclusteredOutMinLength) throws IOException {

int minSamplesClustered = (int) optSet.getOpt("min-samples-clustered").getValueOrDefault();
int minSeqsClusteredIn = (int) optSet.getOpt("min-seqs-clustered-in").getValueOrDefault();
Expand All @@ -301,8 +296,10 @@ private int processCluster(ClusteredSequencesMSA clusteredSeqs, OptSet optSet, i
boolean supressIntra = optSet.getOpt("supress-intra-snps").getOptFlag();
boolean supressInter = optSet.getOpt("supress-inter-snps").getOptFlag();
boolean outputUnvaryingClusters = optSet.getOpt("out-unvarying-clusters").getOptFlag();
boolean merge = !optSet.getOpt("no-merge").getOptFlag();

boolean appendSequencesToSnpList = true;
boolean unclusteredOrLeftover = false;
int size = clusteredSeqs.size();
if (size >= minSeqsClusteredIn && size <= maxSeqsClusteredIn && clusteredSeqs.getNumClusteredSamples() >= minSamplesClustered) {
//CALL WITHIN EACH SAMPLE
Expand All @@ -313,27 +310,14 @@ private int processCluster(ClusteredSequencesMSA clusteredSeqs, OptSet optSet, i
suffix = "HAS_INTRA";
hasIntra = true;
}
// String clusterString = "ALL";
//MERGE NON-CONFLICTING SEQUENCES WITHIN EACH SAMPLE
if (clusteredSeqs.mergeSequencesWithinSamples()) {
// clusterString = "MERGED";

// MERGE NON-CONFLICTING SEQUENCES WITHIN EACH SAMPLE
if (merge && clusteredSeqs.mergeSequencesWithinSamples()) {
suffix = "MERGED";
}

//CALL BETWEEN SAMPLES
clusteredSeqs.callSNPsBetweenAllSamples(maxIndelLength, minIndelDistFromEnds);
// boolean hasInter = false;
// if (!clusteredSeqs.getInterSnps().isEmpty()) {
// hasInter = true;
// }
boolean hasInter = clusteredSeqs.hasInterSnps(minInterIdentity);
// ArrayList<Double> pairwiseIntraIdenities = clusteredSeqs.getPairwiseIntraIdenities();
// try {
// Double minIdentity = Collections.min(pairwiseIntraIdenities);
// System.out.println(minIdentity);
// } catch (NoSuchElementException e) {
// int x =0;
// }

int intra = clusteredSeqs.getIntraSnps().size();
int inter = clusteredSeqs.getInterSnps().size();
Expand All @@ -345,10 +329,10 @@ private int processCluster(ClusteredSequencesMSA clusteredSeqs, OptSet optSet, i
// clusteredSeqs.printCluster((clusterNumber) + " " + clusterString, maxIndelLength);
if ((hasIntra && !supressIntra) || (hasInter && !supressInter) || outputUnvaryingClusters) {
if (clustersFastaOut != null) {
clustersFastaOut.write(clusteredSeqs.getClusterForPrint(clusterNumber, true).toString());
clustersFastaOut.write(clusteredSeqs.getClusterForPrint(clusterNumber, true, Integer.MIN_VALUE).toString());
}
if (clustersMsaOut != null) {
clustersMsaOut.write(clusteredSeqs.getClusterForPrint(clusterNumber, false).toString());
clustersMsaOut.write(clusteredSeqs.getClusterForPrint(clusterNumber, false, Integer.MIN_VALUE).toString());
}
}
if (!supressIntra) {
Expand All @@ -357,8 +341,25 @@ private int processCluster(ClusteredSequencesMSA clusteredSeqs, OptSet optSet, i
if (!supressInter) {
clusteredSeqs.printInterSnps(clusterNumber, reverseLex, DELIMITER, suffix, minInterIdentity, appendSequencesToSnpList);
}
} else { //too many inter or intra SNPs or clustered sequences
unclusteredOrLeftover = true;
}
} else { // too few/many clustered seqs or samples
unclusteredOrLeftover = true;
}

//out unclustered and otherwise leftover sequences
// if (unclusteredFastaOut != null && clusteredSeqs.size() == 1) {
// if (unclusteredFastaOut != null && clusteredSeqs.originalSize() == 1) {
if (unclusteredFastaOut != null && unclusteredOrLeftover) {
unclusteredFastaOut.write(clusteredSeqs.getClusterForPrint(null, true, unclusteredOutMinLength).toString());
// MsaSequence seq = clusteredSeqs.getSequencesList().get(0);
// if (seq.getUnpaddedLength() >= unclusteredOutMinLength) {
// unclusteredFastaOut.write(clusteredSeqs.getSequencesList().get(0).getFasta(true).toString());
// unclusteredFastaOut.newLine();
// }
}

return clusterNumber;
}

Expand Down

0 comments on commit 436873b

Please sign in to comment.