Skip to content

Commit

Permalink
Improve PDB formatting with incomplete Monomer info (rdkit#7286)
Browse files Browse the repository at this point in the history
* Fix PDB formatting if residue name is set but atom name is not.

* additionally, sets the width of the atom name to 4. If the name is set
  to something shorter than 4 letters, fill up with whitespace. This
  avoids bad column alignment. If the name is set to something with more
  than 4 characters, the PDB will still be invalid.

* Add a test for missing residue name and short atom names

* Fix atom name alignment in unit test

* Small updates to PDB line formatting, add a unit test

* Truncate residue names to 3 letters, and atom names to 4
* Add a unit test to ensure that atom and residue names are truncated.
* Switch from std::array<char, 2> to std::string for the atom number.
* Use name.empty() instead of name.size() == 0

---------

Co-authored-by: Franz Waibl <waiblfranz@gmail.com>
  • Loading branch information
fwaibl and Franz Waibl authored Jul 10, 2024
1 parent b44478d commit 8bc2b5d
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 30 deletions.
71 changes: 41 additions & 30 deletions Code/GraphMol/FileParsers/PDBWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,18 @@
// flavor & 32 : Write TER record

namespace RDKit {

// Get the next atom number of an element, formatted as a 2-letter string.
std::string GetDefaultAtomNumber(const Atom* atom,
std::map<unsigned int, unsigned int> &elem);

std::string GetPDBAtomLine(const Atom *atom, const Conformer *conf,
std::map<unsigned int, unsigned int> &elem) {
PRECONDITION(atom, "bad atom");
std::stringstream ss;

std::string symb = atom->getSymbol();
char at1, at2, at3, at4;
char at1, at2;
switch (symb.length()) {
case 0:
at1 = ' ';
Expand All @@ -68,13 +73,19 @@ std::string GetPDBAtomLine(const Atom *atom, const Conformer *conf,
ss << (info->getIsHeteroAtom() ? "HETATM" : "ATOM ");
ss << std::setw(5) << atom->getIdx() + 1;
ss << ' ';
ss << info->getName(); // Always 4 characters?
const std::string& name = info->getName();
if (name.empty()) {
std::string atnum = GetDefaultAtomNumber(atom, elem);
ss << at1 << at2 << atnum;
} else {
ss << std::setw(4) << name.substr(0, 4);
}
const char *ptr = info->getAltLoc().c_str();
if (*ptr == '\0') {
ptr = " ";
}
ss << *ptr;
ss << info->getResidueName(); // Always 3 characters?
ss << std::setw(3) << info->getResidueName().substr(0, 3);
ss << ' ';
ptr = info->getChainId().c_str();
if (*ptr == '\0') {
Expand All @@ -90,36 +101,11 @@ std::string GetPDBAtomLine(const Atom *atom, const Conformer *conf,
ss << " ";
} else {
info = (AtomPDBResidueInfo *)nullptr;
unsigned int atno = atom->getAtomicNum();
if (elem.find(atno) == elem.end()) {
elem[atno] = 1;
at3 = '1';
at4 = ' ';
} else {
unsigned int tmp = elem[atno] + 1;
elem[atno] = tmp;
if (tmp < 10) {
at3 = tmp + '0';
at4 = ' ';
} else if (tmp < 100) {
at3 = (tmp / 10) + '0';
at4 = (tmp % 10) + '0';
} else if (tmp < 360) {
at3 = ((tmp - 100) / 10) + 'A';
at4 = ((tmp - 100) % 10) + '0';
} else if (tmp < 1036) {
at3 = ((tmp - 360) / 26) + 'A';
at4 = ((tmp - 360) % 26) + 'A';
} else {
at3 = ' ';
at4 = ' ';
}
}

std::string atnum = GetDefaultAtomNumber(atom, elem);
ss << "HETATM";
ss << std::setw(5) << atom->getIdx() + 1;
ss << ' ';
ss << at1 << at2 << at3 << at4;
ss << at1 << at2 << atnum;
ss << " UNL 1 ";
}

Expand Down Expand Up @@ -153,6 +139,31 @@ std::string GetPDBAtomLine(const Atom *atom, const Conformer *conf,
return ss.str();
}

std::string GetDefaultAtomNumber(const Atom* atom, std::map<unsigned int, unsigned int> &elem) {
std::string ret = " ";
unsigned int atno = atom->getAtomicNum();
if (elem.find(atno) == elem.end()) {
elem[atno] = 1;
ret[0] = '1';
} else {
unsigned int tmp = elem[atno] + 1;
elem[atno] = tmp;
if (tmp < 10) {
ret[0] = tmp + '0';
} else if (tmp < 100) {
ret[0] = (tmp / 10) + '0';
ret[1] = (tmp % 10) + '0';
} else if (tmp < 360) {
ret[0] = ((tmp - 100) / 10) + 'A';
ret[1] = ((tmp - 100) % 10) + '0';
} else if (tmp < 1036) {
ret[0] = ((tmp - 360) / 26) + 'A';
ret[1] = ((tmp - 360) % 26) + 'A';
}
}
return ret;
}

std::string GetPDBBondLines(const Atom *atom, bool all, bool both, bool mult,
unsigned int &conect_count) {
PRECONDITION(atom, "bad atom");
Expand Down
55 changes: 55 additions & 0 deletions Code/GraphMol/FileParsers/file_parsers_catch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <GraphMol/FileParsers/PNGParser.h>
#include <GraphMol/FileParsers/MolFileStereochem.h>
#include <GraphMol/FileParsers/MolWriters.h>
#include <GraphMol/MonomerInfo.h>
#include <RDGeneral/FileParseException.h>
#include <boost/algorithm/string.hpp>

Expand Down Expand Up @@ -3018,6 +3019,60 @@ TEST_CASE("test bond flavors when writing PDBs", "[bug]") {
}
}

TEST_CASE("test output with incomplete monomer info", "[bug][writer]") {
SECTION("basics") {
{
auto m = "Cl"_smiles;
REQUIRE(m);
std::string pdb = MolToPDBBlock(*m, -1);
CHECK(pdb.find("HETATM 1 CL1 UNL 1") != std::string::npos);
}
{
auto m = "Cl"_smiles;
// will get deleted by ~Atom()
AtomPDBResidueInfo *info = new AtomPDBResidueInfo();
info->setResidueName("HCL");
m->getAtomWithIdx(0)->setMonomerInfo(info);
std::string pdb = MolToPDBBlock(*m, -1);
CHECK(pdb.find("ATOM 1 CL1 HCL 0") != std::string::npos);
}
{
auto m = "Cl"_smiles;
// will get deleted by ~Atom()
AtomPDBResidueInfo *info = new AtomPDBResidueInfo();
info->setResidueName("HCL");
info->setName("Cl1");
m->getAtomWithIdx(0)->setMonomerInfo(info);
std::string pdb = MolToPDBBlock(*m, -1);
CHECK(pdb.find("ATOM 1 Cl1 HCL 0") != std::string::npos);
}
{
// 1. should add spaces for padding if the atom name is too short
// 2. should add spaces for missing residue name.
auto m = "Cl"_smiles;
// will get deleted by ~Atom()
AtomPDBResidueInfo *info = new AtomPDBResidueInfo();
info->setName("CL");
m->getAtomWithIdx(0)->setMonomerInfo(info);
std::string pdb = MolToPDBBlock(*m, -1);
CHECK(pdb.find("ATOM 1 CL 0") != std::string::npos);
}
{
// 1. Test that atom names get truncated to 4 letters.
// 2. Test that residue names get truncated to 3 letters.
auto m = "Cl"_smiles;
// will get deleted by ~Atom()
AtomPDBResidueInfo *info = new AtomPDBResidueInfo();
info->setName("CHLORINE_ATOM1");
info->setResidueName("CHLORINE_MOLECULE1");
m->getAtomWithIdx(0)->setMonomerInfo(info);
std::string pdb = MolToPDBBlock(*m, -1);
std::cout << pdb << std::endl;
CHECK(pdb.find("ATOM 1 CHLO CHL 0") != std::string::npos);
}
}
}

TEST_CASE(
"github #3599: Add explicit support for remaining CTAB query bond types",
"[feature]") {
Expand Down

0 comments on commit 8bc2b5d

Please sign in to comment.