Skip to content

Commit f0cb545

Browse files
committed
optionally, shorten monomer names also for mmCIF files
because old programs may not support 5-character residue names.
gemmi convert: added option --shorten-tlc (TLC = three-letter code, which actually may not be three characters long and may contain digits).
Shortened and original names are stored in _chem_comp.id and _chem_comp.three_letter_code, respectively. When reading a file, the original names are automatically restored.
1 parent dd709b6 commit f0cb545

File tree

5 files changed

+40
-8
lines changed

5 files changed

+40
-8
lines changed

docs/convert-help.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Any output options:
3737
--shorten Shorten chain names to 1 (if # < 63) or 2 characters.
3838
--rename-chain=OLD:NEW Rename chain OLD to NEW (--rename-chain=:A adds
3939
missing chain IDs).
40+
--shorten-tlc Change 5-character monomer names to 3-char. aliases.
4041
--monomer=OLD:NEW Change monomer name (CCD code) OLD to NEW.
4142
-s FILE Use sequence(s) from FILE in PIR or FASTA format. Each
4243
chain is assigned the best matching sequence, if any.

docs/mol.rst

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1992,11 +1992,16 @@ with two functions:
19921992

19931993
* ``restore_full_ccd_codes()`` restores the original names.
19941994

1995-
When reading a PDB file with the tilde-hetnam extension,
1996-
the long names are restored automatically. Apart from this,
1997-
switching between long and short names requires function calls.
1995+
When reading a file with monomer names shortened in a gemmi-compatible way:
19981996

1999-
Internally, the mapping between old and new names is stored in
1997+
* the tilde-hetnam extension in PDB
1998+
* shortened and original names in ``_chem_comp.id`` and
1999+
``_chem_comp.three_letter_code`` in mmCIF,
2000+
2001+
the long names are automatically restored. Apart from this,
2002+
switching between the long and short names requires function calls.
2003+
2004+
Internally, the mapping between names is stored in
20002005
``Structure::shortened_ccd_codes``.
20012006

20022007
.. doctest::

prog/convert.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ enum OptionIndex {
4747
ExpandNcs, AsAssembly,
4848
RemoveH, RemoveWaters, RemoveLigWat, TrimAla, Select, Remove, ApplySymop,
4949
Reframe, ShortTer, Linkr, CopyRemarks, Minimal, ShortenCN, RenameChain,
50-
ChangeCcdCode, SetSeq,
50+
ShortenTLC, ChangeCcdCode, SetSeq,
5151
SiftsNum, Biso, Anisou, SetCis, SegmentAsChain, OldPdb, ForceLabel
5252
};
5353

@@ -103,6 +103,8 @@ const option::Descriptor Usage[] = {
103103
{ RenameChain, 0, "", "rename-chain", Arg::ColonPair,
104104
" --rename-chain=OLD:NEW \tRename chain OLD to NEW "
105105
"(--rename-chain=:A adds missing chain IDs)." },
106+
{ ShortenTLC, 0, "", "shorten-tlc", Arg::None,
107+
" --shorten-tlc \tChange 5-character monomer names to 3-char. aliases." },
106108
{ ChangeCcdCode, 0, "", "monomer", Arg::ColonPair,
107109
" --monomer=OLD:NEW \tChange monomer name (CCD code) OLD to NEW." },
108110
{ SetSeq, 0, "s", "", Arg::Required,
@@ -359,6 +361,9 @@ void convert(gemmi::Structure& st,
359361
for (gemmi::Model& model : st.models)
360362
split_chains_by_segments(model, gemmi::HowToNameCopiedChain::Dup);
361363

364+
if (options[ShortenTLC] || output_type == CoorFormat::Pdb)
365+
shorten_ccd_codes(st);
366+
362367
gemmi::Ofstream os(output, &std::cout);
363368

364369
if (output_type == CoorFormat::Mmcif || output_type == CoorFormat::Mmjson) {
@@ -383,7 +388,6 @@ void convert(gemmi::Structure& st,
383388
writer.write_json(doc);
384389
}
385390
} else if (output_type == CoorFormat::Pdb) {
386-
shorten_ccd_codes(st);
387391
gemmi::PdbWriteOptions opt;
388392
if (options[Minimal])
389393
opt = gemmi::PdbWriteOptions::minimal();

src/mmcif.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <gemmi/mmcif_impl.hpp> // for set_cell_from_mmcif
66
#include <gemmi/atox.hpp> // for string_to_int
77
#include <gemmi/enumstr.hpp> // for entity_type_from_string, polymer_type_from_string
8+
#include <gemmi/polyheur.hpp> // for restore_full_ccd_codes
89

910
namespace gemmi {
1011

@@ -895,6 +896,17 @@ Structure make_structure_from_block(const cif::Block& block_) {
895896
st.assemblies = read_assemblies(block);
896897
read_sifts_unp(block, st);
897898

899+
cif::Table chem_comp_table = block.find("_chem_comp.", {"id", "three_letter_code"});
900+
if (chem_comp_table.ok()) {
901+
for (auto row : chem_comp_table) {
902+
std::string alias = row.str(0);
903+
std::string long_id = row.str(1);
904+
if (!alias.empty() && !long_id.empty() && alias != long_id && alias.back() == '~')
905+
st.shortened_ccd_codes.emplace_back(long_id, alias);
906+
}
907+
restore_full_ccd_codes(st);
908+
}
909+
898910
return st;
899911
}
900912

src/to_mmcif.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,8 +583,18 @@ void update_mmcif_block(const Structure& st, cif::Block& block, MmcifOutputGroup
583583
for (const std::string& item : ent.full_sequence)
584584
resnames.insert(Entity::first_mon(item));
585585
cif::Loop& chem_comp_loop = block.init_mmcif_loop("_chem_comp.", {"id", "type"});
586-
for (const std::string& name : resnames)
587-
chem_comp_loop.add_row({cif::quote(name), "."});
586+
if (!st.shortened_ccd_codes.empty())
587+
chem_comp_loop.tags.push_back("_chem_comp.three_letter_code");
588+
for (const std::string& name : resnames) {
589+
chem_comp_loop.values.push_back(cif::quote(name));
590+
chem_comp_loop.values.push_back(".");
591+
if (!st.shortened_ccd_codes.empty()) {
592+
chem_comp_loop.values.push_back(cif::quote(name));
593+
for (const auto& old_new : st.shortened_ccd_codes)
594+
if (old_new.second == name)
595+
chem_comp_loop.values.back() = old_new.first;
596+
}
597+
}
588598
}
589599

590600
if (groups.exptl) {

0 commit comments

Comments
 (0)