@@ -133,6 +133,8 @@ public class StructureIndexer {
133
133
static final int CODESIZE = 8 ; // 8-bit or 256
134
134
static final int CODEBOOKS = 256 ;
135
135
136
+ static final int MAX_ATOMS_V2000 = 999 ;
137
+
136
138
static final char [] ALPHA = {
137
139
'Q' ,'X' ,'Y' ,'Z' ,'U' ,'V' ,'W'
138
140
};
@@ -410,11 +412,11 @@ public Fingerprint getFpSim () {
410
412
public Chemical getMol () {
411
413
if (mol == null ) {
412
414
String mol = doc .get (FIELD_MOLFILE );
413
-
415
+ logger . finest ( String . format ( "in getMol, beginning of mol %s" , ( mol != null && mol . length () > 100 ? mol . substring ( 0 , 99 ) : "blank/short" )));
414
416
415
417
try {
416
418
417
- this .mol = Chemical .parseMol (mol );
419
+ this .mol = Chemical .parse (mol );
418
420
// try{
419
421
// this.mol.aromatize();
420
422
// }catch(Exception e){
@@ -428,10 +430,12 @@ public Chemical getMol () {
428
430
}
429
431
catch (Exception ex ) {
430
432
ex .printStackTrace ();
431
- System .err .println ("bbadmol=\n " +mol );
432
- throw new RuntimeException
433
+ String id = doc .get (FIELD_ID ) != null ? doc .get (FIELD_ID ) : "(unknown)" ;
434
+ System .err .printf ("bbadmol (id=%s=\n %s\n " , id , mol );
435
+ /*throw new RuntimeException
433
436
("Document "+doc.get(FIELD_ID)+" contains bogus "
434
- +"field " +FIELD_MOLFILE +"!\n " , ex );
437
+ +"field "+FIELD_MOLFILE+"!\n" , ex);*/
438
+ return new Chemical ();
435
439
}
436
440
}
437
441
return mol ;
@@ -1198,9 +1202,12 @@ protected void instrument (Document doc, Chemical orig)
1198
1202
byte [] fpSim = fingerprintSim .toByteArray ();
1199
1203
1200
1204
chemical .makeHydrogensExplicit ();
1201
- String indexMolHExp = chemical .toMol (new ChemFormat .MolFormatSpecification ()
1205
+ // if atomCount > MAX_ATOMS_V2000 (i.e. >= 1000 atoms), write SMILES instead of a molfile
1206
+ String indexMolHExp = chemical .getAtomCount () > MAX_ATOMS_V2000 ?
1207
+ chemical .toSmiles (new ChemFormat .SmilesFormatWriterSpecification ().setKekulization (ChemFormat .KekulizationEncoding .FORCE_AROMATIC )) :
1208
+ chemical .toMol (new ChemFormat .MolFormatSpecification ()
1202
1209
.setKekulization (ChemFormat .KekulizationEncoding .FORCE_AROMATIC ));
1203
-
1210
+ logger . finest ( String . format ( "got indexMolHExp %s" , indexMolHExp ));
1204
1211
1205
1212
for (int i = 0 ; i < codebooks .length ; ++i ) {
1206
1213
Codebook cb = codebooks [i ];
0 commit comments