Source code for hbat.utilities.atom_utils

"""
Atom Utilities

This module contains utility functions for working with PDB atoms and elements.
"""

import re
from typing import Dict

from ..constants.pdb_constants import _COMMON_PDB_ATOMS


# Patterns are compiled once at import time instead of being looked up on
# every call. All of them expect a name that is already stripped/upper-cased.
_CHARGED_ION_RE = re.compile(r"^([A-Z]{1,2})[0-9]*[+-]$")
_ORGANIC_ATOM_RE = re.compile(r"^([HCNOS])[A-Z0-9\'\"]*$")
_LEGACY_HYDROGEN_RE = re.compile(r"^[0-9]+H[A-Z0-9\'\"*]*$")
_PHOSPHORUS_RE = re.compile(r"^P[0-9]*$")
_ELEMENT_PREFIX_RE = re.compile(r"^([A-Z]{1,2})")

# Two-letter element symbols that start with "C" and must not be read as
# "carbon + remoteness letter".
_C_PREFIXED_ELEMENTS = frozenset({"CL", "CO", "CU"})


def get_element_from_pdb_atom(atom_name: str) -> str:
    """
    Map PDB atom name to chemical element using regex patterns.

    The name is stripped and upper-cased, then the first matching rule wins:

    1. Charged ions (``CA2+``, ``MG2+``, ``CL-``): the leading one or two
       letters are returned.
    2. ``D`` is returned as deuterium.
    3. Names starting with ``H``, ``C``, ``N``, ``O`` or ``S`` followed only by
       letters, digits or prime marks map to that single-letter element,
       except the bare symbols ``CL``, ``CO`` and ``CU``.
    4. Digit-first hydrogen names (``1HB``, ``2HD1``) map to ``H``.
    5. ``P`` optionally followed by digits maps to ``P``.
    6. Anything else yields its leading one or two letters, which covers bare
       symbols such as ``F``, ``CL``, ``BR``, ``I``, ``MG``, ``K``, ``MN``,
       ``FE``, ``CO``, ``CU`` and ``ZN``.
    7. If the name does not start with a letter it is returned unchanged
       (after normalisation).

    Because rule 3 runs before rule 6, uncharged names that look like a
    remoteness-labelled atom resolve to the light element: ``CA`` -> ``C``,
    ``NA`` -> ``N``, ``NI`` -> ``N``, ``SE`` -> ``S``. Use the charged form
    (``CA2+``) to obtain the metal symbol.

    :param atom_name: PDB atom name (e.g., 'CA', 'OP1', 'H2'', 'CA2+')
    :type atom_name: str
    :returns: Chemical element symbol (e.g., 'C', 'O', 'H', 'CA')
    :rtype: str

    Examples:
        >>> get_element_from_pdb_atom('CA')
        'C'
        >>> get_element_from_pdb_atom('OP1')
        'O'
        >>> get_element_from_pdb_atom('CA2+')
        'CA'
        >>> get_element_from_pdb_atom("H2'")
        'H'
        >>> get_element_from_pdb_atom('1HB')
        'H'
    """
    name = atom_name.strip().upper()

    # Charged ions first, so "CA2+" is calcium rather than an alpha carbon.
    ion = _CHARGED_ION_RE.match(name)
    if ion:
        return ion.group(1)

    if name == "D":
        return "D"

    # H/C/N/O/S followed by remoteness letters, digits or primes. The first
    # letter alone decides the element, so one pattern covers all five.
    organic = _ORGANIC_ATOM_RE.match(name)
    if organic and name not in _C_PREFIXED_ELEMENTS:
        return organic.group(1)

    # Digit-first hydrogen names would otherwise reach the final fallback
    # and be returned verbatim, which is not an element symbol.
    if _LEGACY_HYDROGEN_RE.match(name):
        return "H"

    if _PHOSPHORUS_RE.match(name):
        return "P"

    # Default: leading one or two letters. This also returns bare halogen
    # and metal symbols unchanged, so no explicit lookup table is needed.
    prefix = _ELEMENT_PREFIX_RE.match(name)
    if prefix:
        return prefix.group(1)

    # Nothing recognisable (e.g. empty or purely numeric): return as-is.
    return name
def pdb_atom_to_element(atom_name: str) -> str:
    """
    Resolve a PDB atom name to its chemical element symbol.

    The name is looked up exactly as given (no stripping or case folding) in
    the ``_COMMON_PDB_ATOMS`` table. Names missing from the table are passed
    to :func:`get_element_from_pdb_atom`, which applies pattern-based rules.

    :param atom_name: PDB atom name
    :type atom_name: str
    :returns: Chemical element symbol
    :rtype: str
    """
    # Uncommon name: defer to the slower pattern-based resolver.
    if atom_name not in _COMMON_PDB_ATOMS:
        return get_element_from_pdb_atom(atom_name)

    # Common name: answer straight from the lookup table.
    return _COMMON_PDB_ATOMS[atom_name]