Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add start and end positions in output matrix #135

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions crates/sage-cli/src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ impl Runner {
.format(peptide.proteins.len())
.as_bytes(),
);
record.push_field(peptide.start_position.iter().map(|&x| x.to_string()).collect::<Vec<String>>().join(";").as_bytes());
record.push_field(peptide.end_position.iter().map(|&x| x.to_string()).collect::<Vec<String>>().join(";").as_bytes());
record.push_field(filenames[feature.file_id].as_bytes());
record.push_field(feature.spec_id.as_bytes());
record.push_field(itoa::Buffer::new().format(feature.rank).as_bytes());
Expand Down Expand Up @@ -161,12 +163,14 @@ impl Runner {
"peptide",
"proteins",
"num_proteins",
"start_positions",
"end_positions",
"filename",
"scannr",
"scan",
"rank",
"label",
"expmass",
"calcmass",
"measured_mass",
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the rationale for changing column names?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the rationale here, although I'm not wedded to them, are to 1) keep the underscore delimiting in the rest of the columns, 2) spell out calculated and experimental/measured, and 3) including number after scan doesn't feel necessary since every entry has "scan=[scan number]".

Copy link
Owner

@lazear lazear May 8, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not opposed to it - but the column names have been stable for ~2.5 years so I don't see a huge need to rename them (and thus force people to retool pipelines downstream of Sage). They were originally named like this because sage results (used) to be passed directly to mokapot for rescoring.

"calculated_mass",
"charge",
"peptide_len",
"missed_cleavages",
Expand Down
2 changes: 0 additions & 2 deletions crates/sage-cli/tests/integration.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
use sage_core::database::Builder;
use sage_core::enzyme::Digest;
use sage_core::mass::Tolerance;
use sage_core::peptide::Peptide;
use sage_core::scoring::Scorer;
use sage_core::spectrum::SpectrumProcessor;

Expand Down
37 changes: 30 additions & 7 deletions crates/sage/src/enzyme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,22 @@ pub struct Digest {
/// Cleaved peptide sequence
pub sequence: String,
/// Protein accession
pub protein: Arc<String>,
pub protein: ProteinAssignment,
/// Missed cleavages
pub missed_cleavages: u8,
/// Is this an N-terminal peptide of the protein?
pub position: Position,
/// What residue position does this start at (1-based inclusive)?
pub start_position: u32,
/// What residue position does this end at (1-based inclusive)?
pub end_position: u32
}

#[derive(Clone, PartialOrd, Ord, Debug, Default)]
pub struct ProteinAssignment {
identifier: Arc<String>,
start_position: u32,
end_position: u32
}

#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
Expand Down Expand Up @@ -58,6 +69,8 @@ impl Digest {
sequence: sequence.into_iter().collect(),
missed_cleavages: self.missed_cleavages,
position: self.position,
start_position: self.start_position,
end_position: self.end_position
}
}
}
Expand Down Expand Up @@ -306,6 +319,8 @@ impl EnzymeParameters {
semi_enzymatic: site.semi_enzymatic,
position,
protein: protein.clone(),
start_position: (start + 1) as u32,
end_position: end as u32
});
}
}
Expand All @@ -330,6 +345,8 @@ mod test {
missed_cleavages: 0,
position: Position::Nterm,
protein: Arc::new(String::default()),
start_position: 1,
end_position: 6
},
Digest {
decoy: false,
Expand All @@ -338,6 +355,8 @@ mod test {
missed_cleavages: 0,
position: Position::Nterm,
protein: Arc::new(String::default()),
start_position: 1,
end_position: 6
},
];

Expand All @@ -353,6 +372,8 @@ mod test {
missed_cleavages: 0,
position: Position::Nterm,
protein: Arc::new(String::default()),
start_position: 1,
end_position: 6
},
Digest {
decoy: false,
Expand All @@ -361,6 +382,8 @@ mod test {
missed_cleavages: 0,
position: Position::Internal,
protein: Arc::new(String::default()),
start_position: 7,
end_position: 12
},
];

Expand All @@ -373,11 +396,11 @@ mod test {
fn trypsin() {
let sequence = "MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGN";
let expected = vec![
("MADEEK".into(), Position::Nterm),
("LPPGWEK".into(), Position::Internal),
("MSR".into(), Position::Internal),
("SSGR".into(), Position::Internal),
("VYYFNHITNASQWERPSGN".into(), Position::Cterm),
("MADEEK".into(), Position::Nterm, 1, 6),
("LPPGWEK".into(), Position::Internal, 7, 13),
("MSR".into(), Position::Internal, 14, 16),
("SSGR".into(), Position::Internal, 17, 20),
("VYYFNHITNASQWERPSGN".into(), Position::Cterm, 21, 40),
];

let tryp = EnzymeParameters {
Expand All @@ -391,7 +414,7 @@ mod test {
expected,
tryp.digest(sequence, Arc::default())
.into_iter()
.map(|d| (d.sequence, d.position))
.map(|d| (d.sequence, d.position, d.start_position, d.end_position))
.collect::<Vec<_>>()
);
}
Expand Down
10 changes: 10 additions & 0 deletions crates/sage/src/peptide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ pub struct Peptide {
pub position: Position,

pub proteins: Vec<Arc<String>>,
/// What residue does this peptide start at in the protein (1-based inclusive)?
pub start_position: Vec<u32>,
/// What residue does this peptide end at in the protein (1-based inclusive)?
pub end_position: Vec<u32>,
}

impl Peptide {
Expand Down Expand Up @@ -66,6 +70,8 @@ impl Debug for Peptide {
.field("monoisotopic", &self.monoisotopic)
.field("missed_cleavages", &self.missed_cleavages)
.field("position", &self.position)
.field("start_position", &self.start_position)
.field("end_position", &self.end_position)
.finish()
}
}
Expand Down Expand Up @@ -313,6 +319,8 @@ impl Peptide {
s[1..n].reverse();
pep.sequence = Arc::from(s.into_boxed_slice());
pep.modifications[1..n].reverse();
pep.start_position = pep.start_position; // TODO: calculate start/end in reversed protein sequences?
pep.end_position = pep.end_position;
}
pep
}
Expand Down Expand Up @@ -373,6 +381,8 @@ impl TryFrom<Digest> for Peptide {
missed_cleavages: value.missed_cleavages,
semi_enzymatic: value.semi_enzymatic,
proteins: vec![value.protein],
start_position: vec![value.start_position],
end_position: vec![value.end_position],
})
}
}
Expand Down