# Source code for hbat.ccd.ccd_analyzer

"""
Chemical Component Dictionary (CCD) BinaryCIF Data Analyzer

This module provides efficient parsing and lookup functionality for CCD BinaryCIF files,
with automatic download capabilities and in-memory data structures optimized for
fast atom and bond lookups by residue and atom IDs.
"""

import os
import urllib.request
from typing import Dict, List, Optional, Set, Tuple, Union

import pandas as pd
from mmcif.io.BinaryCifReader import BinaryCifReader


class CCDDataManager:
    """
    Manages Chemical Component Dictionary data with efficient lookup capabilities.

    This class handles automatic download of CCD BinaryCIF files and provides
    optimized in-memory data structures for fast lookups of atoms and bonds
    by component ID and atom ID.

    Atom and bond tables are loaded on first use and then kept in memory for
    the lifetime of the instance.
    """

    def __init__(self, ccd_folder: Optional[str] = None):
        """
        Initialize the CCD data manager.

        Args:
            ccd_folder: Path to folder for storing CCD BinaryCIF files.
                If None, the path is taken from the HBAT configuration
                (``get_hbat_config().get_ccd_data_path()``).
        """
        if ccd_folder is None:
            # Import here to avoid circular imports
            from ..core.app_config import get_hbat_config

            config = get_hbat_config()
            self.ccd_folder = config.get_ccd_data_path()
        else:
            self.ccd_folder = ccd_folder

        self.atom_file = os.path.join(self.ccd_folder, "cca.bcif")
        self.bond_file = os.path.join(self.ccd_folder, "ccb.bcif")

        # Data storage - loaded on demand; None means "not loaded yet"
        self._atoms_data: Optional[Dict[str, List[Dict]]] = None
        self._bonds_data: Optional[Dict[str, List[Dict]]] = None
        self._atom_lookup: Optional[Dict[Tuple[str, str], Dict]] = None
        self._bond_lookup: Optional[Dict[str, List[Dict]]] = None

        # File URLs
        self.atom_url = "https://models.rcsb.org/cca.bcif"
        self.bond_url = "https://models.rcsb.org/ccb.bcif"

    def _download_file(self, url: str, file_path: str) -> None:
        """
        Download ``url`` to ``file_path`` without ever exposing a partial file.

        The payload is written to a sibling ``.part`` file and only renamed to
        ``file_path`` once the transfer finished. An interrupted download
        therefore cannot leave a truncated file that a later existence check
        would mistake for valid data.

        Args:
            url: Source URL
            file_path: Final destination path

        Raises:
            Exception: Whatever ``urllib.request.urlretrieve`` or
                ``os.replace`` raises; the partial file is removed first.
        """
        partial_path = f"{file_path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, file_path)
        finally:
            # After a successful os.replace the temporary file is gone;
            # anything still present is an incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def ensure_files_exist(self) -> bool:
        """
        Ensure CCD BinaryCIF files exist, downloading if necessary.

        Returns:
            True if files are available, False if download failed
        """
        # Create directory if it doesn't exist
        os.makedirs(self.ccd_folder, exist_ok=True)

        # Status updates are skipped when the configuration module
        # cannot be imported.
        try:
            from ..core.app_config import get_hbat_config

            config = get_hbat_config()
        except ImportError:
            config = None

        files_to_check = [
            (self.atom_file, self.atom_url, "atom data"),
            (self.bond_file, self.bond_url, "bond data"),
        ]

        for file_path, url, description in files_to_check:
            if os.path.exists(file_path):
                print(f"Found existing {description} at {file_path}")
                # Update config that files are present
                if config:
                    config.update_ccd_status(True)
                continue

            print(f"Downloading CCD {description} from {url}...")
            try:
                self._download_file(url, file_path)
                print(f"Successfully downloaded {description} to {file_path}")

                # Update config with successful download
                if config:
                    from datetime import datetime

                    config.update_ccd_status(True, datetime.now().isoformat())
            except Exception as e:
                # Any failure is reported to the caller as "files not available".
                print(f"Error downloading {description}: {e}")
                return False

        return True

    def _read_bcif_file(self, file_path: str) -> List:
        """
        Read a BinaryCIF file and return the data containers.

        Args:
            file_path: Path to the BinaryCIF file

        Returns:
            List of data containers from the BinaryCIF file; an empty list
            if the file could not be read
        """
        try:
            reader = BinaryCifReader()
            data = reader.deserialize(file_path)
            return data if isinstance(data, list) else [data]
        except Exception as e:
            # Best effort: callers treat an empty list as "nothing loaded".
            print(f"Error reading {file_path}: {e}")
            return []

    def _collect_category_rows(self, data_containers: List, category: str) -> List[Dict]:
        """
        Collect the rows of one category from every data container.

        Within each container only the first object whose name contains
        ``category`` is read. Containers without a ``getObjNameList`` method
        are skipped.

        Args:
            data_containers: Containers returned by ``_read_bcif_file``
            category: Substring identifying the category name
                (e.g. ``"chem_comp_atom"``)

        Returns:
            One dictionary per row, mapping attribute name to value
        """
        rows: List[Dict] = []
        for container in data_containers:
            if not hasattr(container, "getObjNameList"):
                continue

            for obj_name in container.getObjNameList():
                if category not in obj_name:
                    continue

                obj = container.getObj(obj_name)
                if obj:
                    attr_names = obj.getAttributeList()
                    for i in range(obj.getRowCount()):
                        rows.append(
                            {attr: obj.getValue(attr, i) for attr in attr_names}
                        )
                break  # Only process first matching object in this container

        return rows

    @staticmethod
    def _count_bond_orders(bonds: List[Dict]) -> Dict[str, int]:
        """
        Tally bonds by their ``value_order`` field.

        Args:
            bonds: Bond dictionaries

        Returns:
            Mapping of bond order to number of bonds; bonds without a
            ``value_order`` key are counted under ``"unknown"``
        """
        counts: Dict[str, int] = {}
        for bond in bonds:
            order = bond.get("value_order", "unknown")
            counts[order] = counts.get(order, 0) + 1
        return counts

    def load_atoms_data(self) -> bool:
        """
        Load and parse atom data from CCD BinaryCIF file into memory.

        Returns:
            True if successful, False otherwise
        """
        if self._atoms_data is not None:
            return True  # Already loaded

        if not self.ensure_files_exist():
            return False

        print("Loading atom data into memory...")
        data_containers = self._read_bcif_file(self.atom_file)
        if not data_containers:
            print("No atom data containers found")
            return False

        atoms_by_comp: Dict[str, List[Dict]] = {}
        atom_lookup: Dict[Tuple[str, str], Dict] = {}

        atom_rows = self._collect_category_rows(data_containers, "chem_comp_atom")
        for atom_data in atom_rows:
            comp_id = atom_data.get("comp_id", "")
            atom_id = atom_data.get("atom_id", "")

            # Store in component-grouped structure
            atoms_by_comp.setdefault(comp_id, []).append(atom_data)
            # Store in lookup structure (a later duplicate key wins)
            atom_lookup[(comp_id, atom_id)] = atom_data

        self._atoms_data = atoms_by_comp
        self._atom_lookup = atom_lookup

        print(f"Loaded {len(atom_rows)} atoms for {len(atoms_by_comp)} components")
        return True

    def load_bonds_data(self) -> bool:
        """
        Load and parse bond data from CCD BinaryCIF file into memory.

        Returns:
            True if successful, False otherwise
        """
        if self._bonds_data is not None:
            return True  # Already loaded

        if not self.ensure_files_exist():
            return False

        print("Loading bond data into memory...")
        data_containers = self._read_bcif_file(self.bond_file)
        if not data_containers:
            print("No bond data containers found")
            return False

        bonds_by_comp: Dict[str, List[Dict]] = {}
        bond_lookup: Dict[str, List[Dict]] = {}

        bond_rows = self._collect_category_rows(data_containers, "chem_comp_bond")
        for bond_data in bond_rows:
            comp_id = bond_data.get("comp_id", "")

            # Store in component-grouped structure
            bonds_by_comp.setdefault(comp_id, []).append(bond_data)
            # Store in lookup structure (for future atom-based lookups)
            bond_lookup.setdefault(comp_id, []).append(bond_data)

        self._bonds_data = bonds_by_comp
        self._bond_lookup = bond_lookup

        print(f"Loaded {len(bond_rows)} bonds for {len(bonds_by_comp)} components")
        return True

    def get_component_atoms(self, comp_id: str) -> List[Dict]:
        """
        Get all atoms for a specific component.

        Args:
            comp_id: Component identifier (e.g., 'ALA', 'GLY')

        Returns:
            List of atom dictionaries for the component; empty if the
            component is unknown or atom data could not be loaded
        """
        if not self.load_atoms_data():
            return []
        return self._atoms_data.get(comp_id, [])

    def get_component_bonds(self, comp_id: str) -> List[Dict]:
        """
        Get all bonds for a specific component.

        Args:
            comp_id: Component identifier (e.g., 'ALA', 'GLY')

        Returns:
            List of bond dictionaries for the component; empty if the
            component is unknown or bond data could not be loaded
        """
        if not self.load_bonds_data():
            return []
        return self._bonds_data.get(comp_id, [])

    def get_atom_by_id(self, comp_id: str, atom_id: str) -> Optional[Dict]:
        """
        Get a specific atom by component and atom ID.

        Args:
            comp_id: Component identifier
            atom_id: Atom identifier

        Returns:
            Atom dictionary if found, None otherwise
        """
        if not self.load_atoms_data():
            return None
        return self._atom_lookup.get((comp_id, atom_id))

    def get_bonds_involving_atom(self, comp_id: str, atom_id: str) -> List[Dict]:
        """
        Get all bonds involving a specific atom.

        Args:
            comp_id: Component identifier
            atom_id: Atom identifier

        Returns:
            List of bond dictionaries whose ``atom_id_1`` or ``atom_id_2``
            equals ``atom_id``
        """
        return [
            bond
            for bond in self.get_component_bonds(comp_id)
            if atom_id in (bond.get("atom_id_1"), bond.get("atom_id_2"))
        ]

    def get_available_components(self) -> Set[str]:
        """
        Get set of all available component IDs.

        Returns:
            Set of component identifiers found in the atom and/or bond data
        """
        components: Set[str] = set()

        if self.load_atoms_data():
            components.update(self._atoms_data)
        if self.load_bonds_data():
            components.update(self._bonds_data)

        return components

    def get_component_summary(self, comp_id: str) -> Dict:
        """
        Get summary information for a component.

        Args:
            comp_id: Component identifier

        Returns:
            Dictionary with component summary: atom and bond counts, bond
            order tally, aromatic bond count, atom IDs and an ``available``
            flag that is True when any atom or bond was found
        """
        atoms = self.get_component_atoms(comp_id)
        bonds = self.get_component_bonds(comp_id)

        aromatic_count = sum(
            1 for bond in bonds if bond.get("pdbx_aromatic_flag", "N") == "Y"
        )

        return {
            "component_id": comp_id,
            "atom_count": len(atoms),
            "bond_count": len(bonds),
            "bond_orders": self._count_bond_orders(bonds),
            "aromatic_bonds": aromatic_count,
            "atoms": [atom.get("atom_id", "") for atom in atoms],
            "available": bool(atoms) or bool(bonds),
        }

    def extract_residue_bonds_data(self, residue_list: List[str]) -> Dict[str, Dict]:
        """
        Extract bond information for a list of residues in a format suitable
        for constants generation.

        Args:
            residue_list: List of residue codes to extract data for

        Returns:
            Dictionary mapping residue codes to their bond information.
            Residues without bond data are omitted.
        """
        if not self.load_bonds_data():
            return {}

        residue_bonds: Dict[str, Dict] = {}
        print(f"Extracting bond data for {len(residue_list)} residues...")

        for residue in residue_list:
            bonds_raw = self.get_component_bonds(residue)
            if not bonds_raw:
                print(f" {residue}: No bond data found")
                continue

            # Convert to standardized format
            bonds = [
                {
                    "atom1": bond.get("atom_id_1", ""),
                    "atom2": bond.get("atom_id_2", ""),
                    "order": bond.get("value_order", "unknown"),
                    "aromatic": bond.get("pdbx_aromatic_flag", "N") == "Y",
                }
                for bond in bonds_raw
            ]

            residue_bonds[residue] = {
                "bonds": bonds,
                "bond_count": len(bonds),
                "aromatic_bonds": sum(1 for b in bonds if b["aromatic"]),
                "bond_orders": self._count_bond_orders(bonds_raw),
            }
            print(f" {residue}: {len(bonds)} bonds")

        print(f"Successfully extracted data for {len(residue_bonds)} residues")
        return residue_bonds