Debugging KeyError for water missing H and for Deuterium

lcmd-epfl · Nov 13, 2024 · 57f53b6 · 57f53b6
1 parent 2ec3135
commit 57f53b6
Show file tree

Hide file tree

Showing 6 changed files with 67 additions and 49 deletions.
diff --git a/cell2mol/charge_assignment.py b/cell2mol/charge_assignment.py
@@ -410,7 +410,7 @@ def get_protonation_states_specie(specie: object, debug: int=0) -> list:
                         if ligand.NO_type == "Linear":
                             if debug >= 2: print("        GET_PROTONATION_STATES: Found Linear Nitrosyl")
                             elemlist[idx] = "O"
-                            addedlist[idx] = 2
+                            addedlist[idx] = 1
                             metal_electrons[idx] = 1
                         elif ligand.NO_type == "Bent":
                             if debug >= 2: print("        GET_PROTONATION_STATES: Found Bent Nitrosyl")

diff --git a/cell2mol/classes.py b/cell2mol/classes.py
@@ -1334,14 +1334,12 @@ def get_reference_molecules(self, ref_labels: list, ref_fracs: list, cov_factor:
         refcell.get_subtype("reference")
         # Get reference molecules
         blocklist = split_species(ref_labels, ref_pos, cov_factor=cov_factor)
-        print(blocklist)
         self.refmoleclist = []
         for b in blocklist:
             mol_labels       = extract_from_list(b, ref_labels, dimension=1)
             mol_coord        = extract_from_list(b, ref_pos, dimension=1)
             mol_frac_coord   = extract_from_list(b, ref_fracs, dimension=1)
             newmolec         = molecule(mol_labels, mol_coord, mol_frac_coord)
-            print(newmolec)
             newmolec.add_parent(self, indices=b)
             newmolec.add_parent(refcell, indices=b)
             newmolec.set_adjacency_parameters(cov_factor, metal_factor)
@@ -1354,20 +1352,20 @@ def get_reference_molecules(self, ref_labels: list, ref_fracs: list, cov_factor:
 
         if debug >= 0: print(f"GETREFS: found {len(self.refmoleclist)} reference molecules")
         if debug >= 0: print(f"GETREFS:", [ref.formula for ref in self.refmoleclist])
-        if debug >= 0: print(f"GETREFS: {self.refmoleclist}")
         # Checks for isolated atoms, and retrieves warning if there is any. Except if it is H, halogen (group 17) or alkalyne (group 2)
         isgood = True 
         for ref in self.refmoleclist:
             if ref.natoms == 1:
                 label = ref.atoms[0].label
                 group = elemdatabase.elementgroup[label]
-                if (group == 1 or group == 2 or group == 17) and label != "H": pass 
-                else:
+                if label == "H" or label == "D": 
                     isgood = False
+                else: #(group == 1 or group == 2 or group == 17)
                     if debug >= 0: print(f"GETREFS: found ref molecule with only one atom {ref.labels}")
 
         # If all good, then works with the reference molecules
         if isgood:
+            self.has_isolated_H = False
             for ref in self.refmoleclist:
                 if debug >= 0: print(f"GETREFS: working with {ref.formula}")
                 if ref.iscomplex: 
@@ -1377,8 +1375,9 @@ def get_reference_molecules(self, ref_labels: list, ref_fracs: list, cov_factor:
                     for met in ref.metals:                         
                         met.get_coordination_geometry(debug=debug)
                         met.get_coord_sphere_formula()
-        if isgood: self.has_isolated_H = False
-        else:      self.has_isolated_H = True
+        else:      
+            self.has_isolated_H = True
+
         return self.refmoleclist
 
     #######################################################

diff --git a/cell2mol/missingH.py b/cell2mol/missingH.py
@@ -108,15 +108,15 @@ def check_missingH(refmoleclist: list, debug: int=0):
     # List of Metal Atoms for which O atoms might appear connected directly.
     Exceptions_for_CoordWater = ["Re", "V", "Mo", "W", "Fe"]
 
-    if debug >= 2: print("")
-    if debug >= 2: print("##################")
-    if debug >= 2: print("Checking Missing H")
-    if debug >= 2: print("##################")
+    if debug >= 1: print("")
+    if debug >= 1: print("##################")
+    if debug >= 1: print("Checking Missing H")
+    if debug >= 1: print("##################")
     for idx, ref in enumerate(refmoleclist):
         if not ref.iscomplex:
             if ref.natoms == 1 and "O" in ref.labels: 
                 Missing_H_in_CoordWater = True
-                if debug >= 2: print(f"WARNING found isolated O atom in the cell. This tends to be a water with missing H, so stopping")
+                if debug >= 1: print(f"WARNING found isolated O atom in the cell. This tends to be a water with missing H, so stopping")
             else:
                 for kdx, a in enumerate(ref.atoms):
                     if not hasattr(a,"adjacency"): continue 
@@ -133,19 +133,19 @@ def check_missingH(refmoleclist: list, debug: int=0):
                             for label, coord in zip(bonded_atom_labels, bonded_atom_coord):
                                 print("Dist", f"{a.label}-{label}", get_dist(a.coord, coord))
                         if ismissingH:
-                            if debug >= 2: print("")
-                            if debug >= 2: print(f"WARNING in Missing H function for: {ref.type}, {idx}, {ref.labels}")
-                            if debug >= 2: print(f"C Atom {kdx} {a.get_parent_index('molecule')} has missing H atoms")
-                            if debug >= 2: print(report)
+                            if debug >= 1: print("")
+                            if debug >= 1: print(f"WARNING in Missing H function for: {ref.type}, {idx}, {ref.labels}")
+                            if debug >= 1: print(f"C Atom {kdx} {a.get_parent_index('molecule')} has missing H atoms")
+                            if debug >= 1: print(report)
                             Missing_H_in_C = True
         else:
             for jdx, lig in enumerate(ref.ligands):
                 if lig.natoms == 1 and "O" in lig.labels and lig.denticity <= 1:
                     if any(m.label in Exceptions_for_CoordWater for m in lig.metals): pass
                     else:
                         Missing_H_in_CoordWater = True
-                        if debug >= 2: print("")
-                        if debug >= 2: print("WARNING in Missing H function for ligand",lig.natoms,lig.labels)
+                        if debug >= 1: print("")
+                        if debug >= 1: print("WARNING in Missing H function for ligand",lig.natoms,lig.labels)
                 else:
                     for kdx, a in enumerate(lig.atoms):
                         if a.label == "C" and a.mconnec == 0:
@@ -158,14 +158,14 @@ def check_missingH(refmoleclist: list, debug: int=0):
                             if debug >= 2: print("Adjacency", a.adjacency, bonded_atom_labels)
                             ismissingH, report = get_missingH_from_adjacency(a.atnum, a.coord, bonded_atom_coord)
                             if ismissingH:
-                                if debug >= 2: print("")
-                                if debug >= 2: print(f"WARNING in Missing H function for: {ref.type}, {idx}, {jdx}, {lig.labels}")
-                                if debug >= 2: print(f"C Atom {kdx} {a.get_parent_index('molecule')} has missing H atoms")
-                                if debug >= 2: print(report)
+                                if debug >= 1: print("")
+                                if debug >= 1: print(f"WARNING in Missing H function for: {ref.type}, {idx}, {jdx}, {lig.labels}")
+                                if debug >= 1: print(f"C Atom {kdx} {a.get_parent_index('molecule')} has missing H atoms")
+                                if debug >= 1: print(report)
                                 Missing_H_in_C = True
 
     if Missing_H_in_C or Missing_H_in_CoordWater:  Warning = True
     if not Warning:
-        if debug >= 2: print("Not a Single Molecule has Missing H atoms (apparently)")
+        if debug >= 1: print("Not a Single Molecule has Missing H atoms (apparently)")
 
     return Warning, ismissingH, Missing_H_in_C, Missing_H_in_CoordWater
diff --git a/cell2mol/new_cell_reconstruction.py b/cell2mol/new_cell_reconstruction.py
@@ -6,8 +6,9 @@
 from cell2mol.connectivity import split_species, count_species, compare_reference_indices
 from cell2mol.cell_operations import translate
 from itertools import combinations
-# from cell2mol.read_write import writexyz
-# import pickle
+from cell2mol.elementdata import ElementData
+elemdatabase = ElementData()
+
 
 ######################################################
 def modify_cov_factor_due_to_H (refcell, debug: int=0):
@@ -16,11 +17,11 @@ def modify_cov_factor_due_to_H (refcell, debug: int=0):
         refcell.check_missing_H(debug=debug)                                     
     else:
         if debug >= 1: print(f"Initial covalent factor: {cov_factor=} before increasing")
-        while refcell.has_isolated_H :
+        while refcell.has_isolated_H and cov_factor < 1.5:
             # Increase covalent factor for H atoms
             cov_factor += 0.05
             refcell.get_reference_molecules(refcell.labels, refcell.frac_coord, cov_factor=cov_factor, debug=0)
-        if debug >= 1: print(f"Covalent factor increases: {cov_factor=}")
+            if debug >= 1: print(f"Covalent factor increases: {cov_factor=}")
         refcell.check_missing_H(debug=debug)
     refcell.assess_errors(mode="hydrogens")
     return refcell
@@ -63,16 +64,20 @@ def apply_symmetry_operations_reference (refcell, cell_vector, sym_ops, normaliz
     new_structures = []
     ref_labels = refcell.labels
     fractional_coords = np.array(refcell.frac_coord)
-    
-    # reference : ase atoms object
-    reference = Atoms(symbols=ref_labels, scaled_positions=fractional_coords, cell=cell_vector, pbc=pbc)
+
+    if "D" in ref_labels:
+        numbers = [elemdatabase.elementnr[elem] for elem in ref_labels] # Atoms object cannot handle Deuterium in the symbols
 
     for rot, trans in zip(sym_ops[0], sym_ops[1]):
         transformed_positions = np.dot(fractional_coords, rot.T)
         transformed_positions += np.array(trans)       
         if normalize:
             transformed_positions = np.remainder(transformed_positions, 1)
-        new = Atoms(symbols=ref_labels, scaled_positions=transformed_positions, cell=cell_vector)    
+        if "D" in ref_labels:
+            new = Atoms(scaled_positions=transformed_positions, numbers=numbers, cell=cell_vector)
+        else:
+            new = Atoms(symbols=ref_labels, scaled_positions=transformed_positions, cell=cell_vector)
+
         new_structures.append(new)
 
     return new_structures
@@ -400,7 +405,7 @@ def fragments_reconstruct (subset_remaining_fragments, target_ref, cell_vector,
     return list_of_found_molecules, list_of_bigger_fragments
 
 ######################################################
-def get_updated_indices(sp_idx, new, cell_labels, cell_pos, cell_fracs, debug: int=0):
+def get_updated_indices(sp_idx, ref_labels, new, cell_labels, cell_pos, cell_fracs, debug: int=0):
     """
     sp_idx : index of the symmetry operation
     new : ase atoms object by applying symmetry operations to the reference structure
@@ -409,11 +414,11 @@ def get_updated_indices(sp_idx, new, cell_labels, cell_pos, cell_fracs, debug: i
     cell_fracs : list of fractional coordinates of the atoms in the unit cell
     """
     indices_lists = []
-    new_labels =  new.get_chemical_symbols()
+    # new_labels =  new.get_chemical_symbols()
     new_pos = new.get_positions()    
     new_fracs = new.get_scaled_positions()
 
-    for jdx, (n_l, n_p, n_f) in enumerate(zip(new_labels, new_pos, new_fracs)):
+    for jdx, (n_l, n_p, n_f) in enumerate(zip(ref_labels, new_pos, new_fracs)):
         for kdx, (l, p, f) in enumerate(zip(cell_labels, cell_pos, cell_fracs)):
             if n_l == l and np.allclose(n_p, p, atol=1e-4, rtol=1e-2):
                 if np.allclose(np.remainder(n_f, 1), np.remainder(f, 1), atol=1e-4, rtol=1e-2):
@@ -423,7 +428,7 @@ def get_updated_indices(sp_idx, new, cell_labels, cell_pos, cell_fracs, debug: i
     return indices_lists
 
 ######################################################
-def get_updated_indices_old (new, cell_labels, cell_pos, cell_fracs, all_found, debug: int=0):
+def get_updated_indices_old (ref_labels, new, cell_labels, cell_pos, cell_fracs, all_found, debug: int=0):
 
     # new structure : ase atoms object by applying symmetry operations to the reference structure
     # Find the indices of the atoms in the unit cell that have the same cartisian coordinates as the atoms in the new structure
@@ -435,8 +440,9 @@ def get_updated_indices_old (new, cell_labels, cell_pos, cell_fracs, all_found,
     updated = [i for i in found_indices if i not in all_found]
 
     new_fracs = new.get_scaled_positions()
-    new_labels =  new.get_chemical_symbols()
-
+    # new_labels =  new.get_chemical_symbols()
+    new_labels = ref_labels
+
     indices_in_ref = [ find_row_index_from_matrix(new_fracs, cell_fracs[u]) for u in updated ]
     if debug >= 2:
         print("Get indices in reference")
@@ -538,10 +544,17 @@ def determine_wrap_keywords_pbc (atoms,refcell, wrap_keywords, debug: int=0):
 def reconstuct (refcell, newcell, sym_ops, debug: int=0):
 
     cell_labels = newcell.labels
+    print(f"{cell_labels=}")
+    if "D" in cell_labels:
+        print("Deuterium is in the cell")
     cell_pos = newcell.coord
     cell_fracs = newcell.frac_coord
     cell_vector = newcell.cell_vector
-
+
+    ref_labels = refcell.labels
+    print(f"{ref_labels=}")
+    if "D" in ref_labels:
+        print("Deuterium is in the reference")
     cov_factor = refcell.refmoleclist[0].cov_factor
     metal_factor = refcell.refmoleclist[0].metal_factor
 
@@ -555,7 +568,7 @@ def reconstuct (refcell, newcell, sym_ops, debug: int=0):
 
     for idx, new in enumerate(new_structures):
         print(f"\nApplying symmetry operations to reference {idx}")
-        indices_lists = get_updated_indices(idx, new, cell_labels, cell_pos, cell_fracs, debug=debug)
+        indices_lists = get_updated_indices(idx, ref_labels, new, cell_labels, cell_pos, cell_fracs, debug=debug)
         print(f"{len(indices_lists)=}")
 
         updated_lists = [i for i in indices_lists if i[1] not in all_found]   

diff --git a/cell2mol/new_charge_assignment.py b/cell2mol/new_charge_assignment.py
@@ -141,7 +141,7 @@ def set_charge_state(reference, target, mode, debug: int=0):
             target_data = target.get_parent_indices("reference")
             index_map = {value: index for index, value in enumerate(target_data)}
             sorted_indices = sorted(range(len(ref_data)), key=lambda i: index_map[ref_data[i]])            
-
+            print(f"SET_CHARGE_STATE:{sorted_indices=}")
             reordered_prot = temp_prot.reorder(sorted_indices)
             # reordered_prot.coords = target.coord
             if debug >=1 : print(f"SET_CHARGE_STATE:({target.subtype}) {target.formula} {reference.charge_state.uncorr_total_charge=} Reordered {sorted_indices=}")

diff --git a/cell2mol/unitcell.py b/cell2mol/unitcell.py
@@ -40,22 +40,28 @@ def process_unitcell(input_path, name, current_dir, debug=0):
             newcell = create_unitcell_object(name, cell_labels, cell_pos, cell_fracs, cell_vector, cell_param, "unitcell")
 
             perform_cell2mol(newcell, refcell, sym_ops, cell_fname, ref_cell_fname, debug)
+
+            # Handle error cases for the unit cell
+            if hasattr(newcell, 'error_case'):
+                error_fname = os.path.join(current_dir, f"unitcell_error_{newcell.error_case}.out")
+                with open(error_fname, "w") as error_output:
+                    with redirect_stdout(error_output):
+                        handle_error(newcell.error_case)
     else:
         logging.error("Error encountered while processing the reference cell")
 
-    # Handle error cases for the unit cell
-    if hasattr(newcell, 'error_case'):
-        error_fname = os.path.join(current_dir, f"unitcell_error_{newcell.error_case}.out")
-        with open(error_fname, "w") as error_output:
-            with redirect_stdout(error_output):
-                handle_error(newcell.error_case)
-
     return newcell
 
 def get_cell_parameters(structure):
     """Extracts cell parameters and symmetry operations from structure."""
     wrap_keywords = {'pbc': True, 'center': (0.5, 0.5, 0.5)}
-    cell_labels = structure.get_chemical_symbols()
+    cell_labels = []
+    for l, n, m in zip(structure.get_chemical_symbols(), structure.get_atomic_numbers(), structure.get_masses()):
+        if n == 1 and (m > 2 or  m== 2.01355) : # Deuterium
+            cell_labels.append("D")
+        else:
+            cell_labels.append(l)
+
     cell_pos = structure.get_positions(wrap=True, **wrap_keywords)
     cell_fracs = structure.get_scaled_positions()
     cell_vector = structure.cell.array