Skip to content

Commit 51fbd67

Browse files
Merge pull request #13 from ncats/index_large_mols
Index large mols
2 parents 118149b + c77e726 commit 51fbd67

File tree

4 files changed

+2875
-8
lines changed

4 files changed

+2875
-8
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
<groupId>gov.nih.ncats</groupId>
55
<artifactId>structure-indexer</artifactId>
66
<packaging>jar</packaging>
7-
<version>0.0.15</version>
7+
<version>0.0.16</version>
88
<name>structure-indexer</name>
99
<url>https://github.com/ncats/structure-indexer</url>
1010
<description>This is a self-contained structure indexer that uses Lucene as the underlying storage and indexing engine.</description>

src/main/java/gov/nih/ncats/structureIndexer/StructureIndexer.java

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,8 @@ public class StructureIndexer {
133133
static final int CODESIZE = 8; // 8-bit or 256
134134
static final int CODEBOOKS = 256;
135135

136+
static final int MAX_ATOMS_V2000 = 999;
137+
136138
static final char[] ALPHA = {
137139
'Q','X','Y','Z','U','V','W'
138140
};
@@ -410,11 +412,11 @@ public Fingerprint getFpSim () {
410412
public Chemical getMol () {
411413
if (mol == null) {
412414
String mol = doc.get(FIELD_MOLFILE);
413-
415+
logger.finest(String.format("in getMol, beginning of mol %s", (mol != null && mol.length() > 100 ?mol.substring(0, 99) : "blank/short")));
414416

415417
try {
416418

417-
this.mol = Chemical.parseMol(mol);
419+
this.mol = Chemical.parse(mol);
418420
// try{
419421
// this.mol.aromatize();
420422
// }catch(Exception e){
@@ -428,10 +430,12 @@ public Chemical getMol () {
428430
}
429431
catch (Exception ex) {
430432
ex.printStackTrace();
431-
System.err.println("bbadmol=\n"+mol);
432-
throw new RuntimeException
433+
String id = doc.get(FIELD_ID) != null ? doc.get(FIELD_ID) : "(unknown)";
434+
System.err.printf("bbadmol (id=%s=\n%s\n", id, mol);
435+
/*throw new RuntimeException
433436
("Document "+doc.get(FIELD_ID)+" contains bogus "
434-
+"field "+FIELD_MOLFILE+"!\n" , ex);
437+
+"field "+FIELD_MOLFILE+"!\n" , ex);*/
438+
return new Chemical();
435439
}
436440
}
437441
return mol;
@@ -1198,9 +1202,12 @@ protected void instrument (Document doc, Chemical orig)
11981202
byte[] fpSim = fingerprintSim.toByteArray();
11991203

12001204
chemical.makeHydrogensExplicit();
1201-
String indexMolHExp = chemical.toMol(new ChemFormat.MolFormatSpecification()
1205+
/// if atomCount >= 1000, use alternate (SMILES)
1206+
String indexMolHExp = chemical.getAtomCount() > MAX_ATOMS_V2000 ?
1207+
chemical.toSmiles(new ChemFormat.SmilesFormatWriterSpecification().setKekulization(ChemFormat.KekulizationEncoding.FORCE_AROMATIC)) :
1208+
chemical.toMol(new ChemFormat.MolFormatSpecification()
12021209
.setKekulization(ChemFormat.KekulizationEncoding.FORCE_AROMATIC));
1203-
1210+
logger.finest(String.format("got indexMolHExp %s", indexMolHExp));
12041211

12051212
for (int i = 0; i < codebooks.length; ++i) {
12061213
Codebook cb = codebooks[i];

0 commit comments

Comments
 (0)