Source code for mcal.utils.cif_reader

"""CifReader beta (2026/01/29)"""
import os
import re
import warnings
from collections import deque
from itertools import product
from pathlib import Path
from typing import Dict, List, Literal, Tuple

import numpy as np
import pandas as pd
from numpy.typing import NDArray


[docs] class CifReader: """CifReader class. This class is used to read cif file and extract crystal information. Raises ------ ElementPropertiesIsNotDefinedError Raised when element properties is not defined. SymmetryIsNotDefinedError Raised when symmetry is not defined. ZValueIsNotMatchError Raised when z value is not match. The atomic bond detection may not be functioning correctly. """ parent_dir = Path(os.path.abspath(__file__)).parent.parent ELEMENT_PROP = pd.read_csv(f'{parent_dir}/constants/element_properties.csv').dropna(axis=0) ATOMIC_WEIGHTS = ELEMENT_PROP[['symbol', 'weight']].set_index('symbol').to_dict()['weight'] COVALENT_RADII = ELEMENT_PROP[['symbol', 'covalent_radius']].set_index('symbol').to_dict()['covalent_radius'] def __init__(self, cif_path: str) -> None: """Initialize the CifReader class. Parameters ---------- cif_path : str Path of cif file. """ self.basename = None # Crystal properties self.cell_lengths = [None, None, None] self.cell_angles = [None, None, None] self.lattice = None self.symmetry_pos = [] self.z_value = 0 self._ref_z_value = 0 # Molecule properties self.symbols = [] self.symbols_label = [] self.coordinates = [] self.sym_symbols = [] self.sym_coords = np.empty((0, 3)) # Unique molecule self.unique_symbols = {} self.unique_coords = {} self._reader(cif_path) self._calc_lattice() self._operate_sym() self.sym_symbols, self.sym_coords = self.remove_duplicates(self.sym_symbols, self.sym_coords) self._make_adjacency_mat() self._split_mols() self._put_unit_cell() self._unwrap_molecules() self._put_unit_cell() # Remove duplicates again as they may occur when moving atoms into the unit cell self.sym_symbols, self.sym_coords = self.remove_duplicates(self.sym_symbols, self.sym_coords) self._make_adjacency_mat() self._split_mols() self._calc_z_value() if self._ref_z_value != 0 and self.z_value != self._ref_z_value: raise ZValueIsNotMatchError( f'Z value is not match. Z value in cif file is {self._ref_z_value}, but calculated Z value is {self.z_value}.' ) def _apply_minimum_image(self): """Apply minimum image convention.""" frac_diff = self.sym_coords[:, np.newaxis, :] - self.sym_coords[np.newaxis, :, :] frac_diff = frac_diff - np.round(frac_diff) cart_diff = np.dot(frac_diff, self.lattice) distance = np.linalg.norm(cart_diff, axis=-1) return distance def _calc_lattice(self): """Calculate lattice.""" a, b, c = self.cell_lengths alpha, beta, gamma = tuple(map(lambda x: np.radians(x), self.cell_angles)) b_x = b * np.cos(gamma) b_y = b * np.sin(gamma) c_x = c * np.cos(beta) v = ((np.cos(alpha) - np.cos(beta) * np.cos(gamma))) / np.sin(gamma) c_y = c * v c_z = c * np.sqrt(1 - np.cos(beta)**2 - v**2) self.lattice = np.array(( (a, 0, 0), (b_x, b_y, 0), (c_x, c_y, c_z), )) def _calc_z_value(self): """Calculate z value.""" for atom_idx in self.bonded_atoms: cen_of_weight = self.calc_cen_of_weight(self.sym_coords[atom_idx]) if self._is_in_unit_cell(cen_of_weight): self.unique_symbols[self.z_value] = self.sym_symbols[atom_idx] self.unique_coords[self.z_value] = self.sym_coords[atom_idx] self.z_value += 1 def _is_in_unit_cell(self, cen_of_weight: NDArray[np.float64]) -> bool: """Determine if the center of weight is in the unit cell. Parameters ---------- cen_of_weight : NDArray[np.float64] Center of weight. Returns ------- bool True if the center of weight is in the unit cell. """ if np.all(0 <= cen_of_weight) and np.all(cen_of_weight < 1): is_in_unit_cell = True else: is_in_unit_cell = False return is_in_unit_cell def _make_adjacency_mat(self): """Determine bonding and create the adjacency matrix.""" num_atoms = len(self.sym_symbols) self.adjacency_mat = np.zeros((num_atoms, num_atoms), dtype=bool) distance = self._apply_minimum_image() try: covalent_distance = np.array([self.COVALENT_RADII[symbol] for symbol in self.sym_symbols]) \ + np.array([self.COVALENT_RADII[symbol] for symbol in self.sym_symbols])[:, np.newaxis] except KeyError: raise ElementPropertiesIsNotDefinedError('Element properties is not defined.') self.adjacency_mat[(distance <= covalent_distance * 1.3) & (distance != 0)] = 1 def _operate_sym(self) -> None: """Perform molecular symmetry operations.""" def _extract_coord(coord: NDArray[np.float64], idx: int, is_minus: bool) -> NDArray[np.float64]: """Extract coordinates from the coordinate array. Parameters ---------- coord : NDArray[np.float64] Coordinate array. idx : int Index of the coordinate to extract. is_minus : bool If True, the coordinate is extracted with a minus sign. Returns ------- NDArray[np.float64] Extracted coordinate array. """ if is_minus: return -coord[:, idx].copy() else: return coord[:, idx].copy() if len(self.symmetry_pos) == 0: raise SymmetryIsNotDefinedError('Symmetry is not defined.') self.sym_symbols = np.tile(self.symbols, len(self.symmetry_pos)) idx_fil = ('x', 'y', 'z') for pos in self.symmetry_pos: moved_coord = np.zeros(self.coordinates.shape) sum_array = np.empty(0) for i, s, in enumerate(pos.split(',')): matches = re.findall(r'[0-9]/[0-9]', s) if matches: fraction = eval(f'float({matches[0]})') else: fraction = 0 sum_array = np.append(sum_array, fraction) terms = [x.replace('+', '') for x in re.findall(r'\+?\-?[x-z]', s)] for term in terms: is_minus = False if '-' in term: is_minus = True term = term[-1] moved_coord[:, i] += _extract_coord( self.coordinates, idx_fil.index(term), is_minus ) moved_coord = sum_array + moved_coord self.sym_coords = np.append(self.sym_coords, moved_coord, axis=0) def _put_unit_cell(self) -> None: """Put molecules into unit cell.""" for atom_idx in self.bonded_atoms: for i, c in enumerate(self.calc_cen_of_weight(self.sym_coords[atom_idx])): if 1 <= c: change = -int(c) elif c < 0: change = abs(int(c)) + 1 else: change = 0 self.sym_coords[atom_idx, i] += change def _reader(self, cif_path: str) -> None: """Read cif file infomation. Parameters ---------- cif_path : str Path of cif file. """ # save index position counter = 0 atom_data_index = { '_atom_site_label': None, '_atom_site_type_symbol': None, '_atom_site_fract_x': None, '_atom_site_fract_y': None, '_atom_site_fract_z': None, } symmetry_data_index = None is_read_atom = False is_read_sym = False with open(cif_path) as f: while True: line = f.readline() if not line: break line = line.strip() # remove blank characters if not line: continue if line.startswith('data_'): self.basename = '_'.join(line.split('_')[1:]) # get unit cell information cell_params = ('_cell_length_a', '_cell_length_b', '_cell_length_c', '_cell_angle_alpha', '_cell_angle_beta', '_cell_angle_gamma') if line.startswith(tuple(cell_params)): value = float(re.sub(r'\(.*\)', '', line.split()[-1])) if line.startswith('_cell_length'): self.cell_lengths[cell_params.index(line.split()[0])] = value else: self.cell_angles[cell_params.index(line.split()[0])%3] = value elif line.startswith('_cell_formula_units_Z'): self._ref_z_value = int(line.split()[-1]) # get index position if 'loop_' == line: counter = 0 is_read_atom = False is_read_sym = False continue elif '_' == line[0]: if line in atom_data_index.keys(): atom_data_index[line] = counter is_read_atom = True is_read_sym = False elif line in ('_symmetry_equiv_pos_as_xyz', '_space_group_symop_operation_xyz'): symmetry_data_index = counter is_read_atom = False is_read_sym = True else: is_read_sym = False counter += 1 continue elif ';' == line[0]: is_read_atom = False is_read_sym = False continue if line[0] not in ('_', '#'): # get symbol and fractional coordinates if is_read_atom: tmp_atom_data = line.split() # remove disorder if '?' not in tmp_atom_data[atom_data_index['_atom_site_label']]: if atom_data_index['_atom_site_type_symbol'] is None: symbol_label = tmp_atom_data[atom_data_index['_atom_site_label']] symbol = re.match(r'[A-Z][a-z]?', symbol_label) if symbol: symbol = symbol.group(0) else: raise ValueError(f'Symbol label {symbol_label} is not valid.') else: symbol_label = tmp_atom_data[atom_data_index['_atom_site_label']] symbol = tmp_atom_data[atom_data_index['_atom_site_type_symbol']] fract_x = tmp_atom_data[atom_data_index['_atom_site_fract_x']] fract_y = tmp_atom_data[atom_data_index['_atom_site_fract_y']] fract_z = tmp_atom_data[atom_data_index['_atom_site_fract_z']] coord = [float(re.sub(r'\(.*\)', '', x)) for x in [fract_x, fract_y, fract_z]] self.symbols.append(symbol) self.symbols_label.append(symbol_label) self.coordinates.append(coord) # get symmetry operation information elif is_read_sym: if "'" in line: line = list(map(lambda x: x.strip().replace(' ', '').replace("'", ""), line.split())) else: line = list(map(lambda x: x.strip().replace(' ', ''), line.split())) self.symmetry_pos.append(line[symmetry_data_index].lower()) self.symbols = np.array(self.symbols) self.coordinates = np.array(self.coordinates) def _search_connect_atoms(self, node: int, atoms: List[int], visited: NDArray[bool], num_atoms: int) -> None: """Find bonded atoms using depth-first search. Parameters ---------- node : int Index of the atom. atoms : List[int] List of bonded atoms. visited : NDArray[bool] Array of visited atoms. num_atoms : int Number of atoms. """ visited[node] = True atoms.append(node) for i in range(num_atoms): if self.adjacency_mat[node, i] and not visited[i]: self._search_connect_atoms(i, atoms, visited, num_atoms) def _split_mols(self) -> None: """Split molecules.""" self.bonded_atoms = [] num_atoms = len(self.sym_symbols) visited = np.zeros(num_atoms, dtype=bool) for i in range(num_atoms): if not visited[i]: atoms = [] self._search_connect_atoms(i, atoms, visited, num_atoms) self.bonded_atoms.append(atoms) # get row corresponding to index self.sub_matrices = [] for index_group in self.bonded_atoms: # get row corresponding to index index_group.sort() sub_matrix = self.adjacency_mat[np.ix_(index_group, index_group)] self.sub_matrices.append(sub_matrix) def _unwrap_molecules(self): """Unwrap molecules using the adjacency matrix based on the minimum image convention.""" for atom_group in self.bonded_atoms: if len(atom_group) <= 1: continue criterion_idx = atom_group[0] # Depth-first search visited = {criterion_idx} stack = deque([criterion_idx]) while stack: current_idx = stack.popleft() current_coord = self.sym_coords[current_idx] for neighbor_idx in atom_group: if neighbor_idx in visited: continue if self.adjacency_mat[current_idx, neighbor_idx]: # Minimum image convention frac_diff = self.sym_coords[neighbor_idx] - current_coord frac_diff = frac_diff - np.round(frac_diff) self.sym_coords[neighbor_idx] = current_coord + frac_diff visited.add(neighbor_idx) stack.append(neighbor_idx)
[docs] def calc_cen_of_weight(self, coordinates: NDArray[np.float64]) -> NDArray[np.float64]: """Calculate center of weight. Parameters ---------- coordinates : NDArray[np.float64] Coordinates of monomolecular. Returns ------- NDArray[np.float64] Center of weight. """ cen_of_weight = np.average(coordinates, axis=0) return np.round(cen_of_weight, decimals=10)
[docs] def convert_cart_to_frac(self, cart_coord: NDArray[np.float64]) -> NDArray[np.float64]: """Convert Cartesian coordinates to fractional coordinates. Parameters ---------- cart_coord : NDArray[np.float64] Cartesian coordinates. Returns ------- NDArray[np.float64] Fractional coordinates. """ a, b, c = self.cell_lengths alpha, beta, gamma = tuple(map(lambda x: np.radians(x), self.cell_angles)) b_x = -np.cos(gamma) / (a*np.sin(gamma)) b_y = 1 / (b*np.sin(gamma)) v = np.sqrt(1 - np.cos(alpha)**2 - np.cos(beta)**2 - np.cos(gamma)**2 + 2*np.cos(alpha)*np.cos(beta)*np.cos(gamma)) c_x = (np.cos(alpha)*np.cos(gamma) - np.cos(beta)) / (a*v*np.sin(gamma)) c_y = (np.cos(beta)*np.cos(gamma) - np.cos(alpha)) / (b*v*np.sin(gamma)) c_z = np.sin(gamma) / (c*v) vector = np.array(( (1/a, 0, 0), (b_x, b_y, 0), (c_x, c_y, c_z), )) return np.dot(cart_coord, vector)
[docs] def convert_frac_to_cart(self, frac_coord: NDArray[np.float64]) -> NDArray[np.float64]: """Convert fractional coordinates to Cartesian coordinates. Parameters ---------- frac_coord : NDArray[np.float64] Fractional coordinates. Returns ------- NDArray[np.float64] Cartesian coordinates. """ return np.dot(frac_coord, self.lattice)
[docs] def expand_mols( self, expand_range: int = 1 ) -> Dict[Tuple[int, int, int], Dict[int, List[Tuple[str, NDArray[np.float64]]]]]: """Generate molecules around unique molecules. Parameters ---------- expand_range : int The number of molecular cycles produced., by default 1 Returns ------- Dict[Tuple[int, int, int], Dict[int, List[Tuple[str, NDArray[np.float64]]]]] A nested dictionary containing the expanded molecular structure: - Outer key: Tuple[int, int, int] Represents the unit cell offset (i, j, k) relative to the origin unit cell. For example, (0, 0, 0) is the origin unit cell, (1, 0, 0) is one unit cell away in the a-direction, etc. - Inner key: int The index of the unique molecule within that unit cell. - Value: Tuple[List[str], NDArray[np.float64]] A list containing molecular information: - List[str]: Element symbols of the molecule - NDArray[np.float64]: Cartesian coordinates of the molecule (shape: (3, n)) """ expand_mols = {} combs = tuple(product(tuple(range(-expand_range, expand_range+1)), repeat=3)) for comb in combs: for i, unique_coord in self.unique_coords.items(): if i == 0: expand_mols[comb] = {i: [self.unique_symbols[i], unique_coord + np.array(comb)]} else: expand_mols[comb][i] = [self.unique_symbols[i], unique_coord + np.array(comb)] return expand_mols
[docs] def export_unit_cell_file(self, file_path: str, format: Literal['mol', 'xyz'] = 'mol') -> None: """export unit cell file Parameters ---------- file_path : str Path of the file to be saved. format : Literal['mol', 'xyz'] Format of the file to be saved. """ unit_cell_file = FileIO() for idx, symbols in self.unique_symbols.items(): unit_cell_file.add_symbols(symbols) unit_cell_file.add_coordinates(self.convert_frac_to_cart(self.unique_coords[idx])) unit_cell_file.add_adjacency_mat(self.sub_matrices[idx]) if format == 'mol' and unit_cell_file.atom_num > 999: format = 'xyz' file_path = file_path.replace('.mol', '.xyz') warnings.warn('The number of atoms is greater than 999. The file is saved as xyz format.') if format == 'mol': unit_cell_file.export_mol_file(file_path, header1=self.basename, header2="Generated by cif_reader.py") elif format == 'xyz': unit_cell_file.export_xyz_file(file_path, comment="Generated by cif_reader.py")
[docs] def remove_duplicates( self, symbol: List[str], coordinate: NDArray[np.float64], tol: float = 1e-4, ) -> Tuple[List[str], NDArray[np.float64]]: """Remove duplicates from symbol and coordinate arrays based on coordinate with a given tolerance. Parameters ---------- symbol : List[str] Symbols of molecules. coordinate : NDArray[np.float64] Coordinates of molecules. tol : float Tolerance for duplicate detection. Returns ------- Tuple[List[str], NDArray[np.float64]] Symbols and coordinates of unique molecules. """ distance_mat = ((coordinate[np.newaxis, :, :] - coordinate[:, np.newaxis, :]) ** 2).sum(axis=-1) dup = (distance_mat <= tol) dup = np.tril(dup, k=-1) unique_indices = ~dup.any(axis=-1) return symbol[unique_indices], coordinate[unique_indices]
[docs] class ElementPropertiesIsNotDefinedError(Exception): """Exception raised when element properties is not defined.""" pass
[docs] class SymmetryIsNotDefinedError(Exception): """Exception raised when symmetry is not defined.""" pass
[docs] class ZValueIsNotMatchError(Exception): """Exception raised when z value is not match.""" pass
[docs] class FileIO: def __init__(self) -> None: self.atom_num = 0 self.symbols_list = [] self.coordinates_list = [] self.adjacency_mat_list = []
[docs] def add_adjacency_mat(self, adjacency_mat: NDArray[bool]) -> None: """add adjacency matrix Parameters ---------- adjacency_mat : NDArray[bool] Adjacency matrix. """ self.adjacency_mat_list.append(adjacency_mat)
[docs] def add_coordinates(self, coordinates: NDArray[np.float64]) -> None: """add coordinates Parameters ---------- coordinates : NDArray[np.float64] Coordinates. """ self.coordinates_list.append(coordinates)
[docs] def add_symbols(self, symbols: List[str]) -> None: """add symbols Parameters ---------- symbols : List[str] Symbols. """ self.atom_num += len(symbols) self.symbols_list.append(symbols)
[docs] def export_mol_file( self, file_path: str, header1: str, header2: str, ) -> None: """export mol file Parameters ---------- file_path : str Path of the file to be saved. header1 : str Header line 1. header2 : str Header line 2. """ atom_lines = [] bond_lines = [] total_atoms = 0 total_bonds = 0 for i in range(len(self.symbols_list)): for s, (x, y, z) in zip(self.symbols_list[i], self.coordinates_list[i]): atom_lines.append(f"{x:10.4f}{y:10.4f}{z:10.4f} {s:<2s} 0\n") for j in range(len(self.symbols_list[i])): for k in range(j): if self.adjacency_mat_list[i][j, k] == 1: bond_lines.append(f"{j+total_atoms+1:3d}{k+total_atoms+1:3d} 1\n") total_bonds += np.int32(np.sum(np.tril(self.adjacency_mat_list[i], k=-1))) total_atoms += len(self.symbols_list[i]) with open(file_path, 'w') as f: f.write(f"{header1}\n") f.write(f"{header2}\n") f.write("\n") f.write(f"{total_atoms:3d}{total_bonds:3d} 0 0 0 0 0 0 0 0999 V2000\n") f.writelines(atom_lines) f.writelines(bond_lines) f.write("M END\n") f.write("$$$$\n")
[docs] def export_xyz_file( self, file_path: str, comment: str, ) -> None: """export xyz file Parameters ---------- file_path : str Path of the file to be saved. comment : str Comment. """ xyz_file_lines = [] total_atoms = 0 for i in range(len(self.symbols_list)): for s, (x, y, z) in zip(self.symbols_list[i], self.coordinates_list[i]): xyz_file_lines.append(f"{s:2s} {x:12.6f} {y:12.6f} {z:12.6f}\n") total_atoms += len(self.symbols_list[i]) with open(file_path, 'w') as f: f.write(f"{total_atoms:3d}\n") f.write(f"{comment}\n") f.writelines(xyz_file_lines)