Source code for wechrom.prepare_memory

import os
import pkg_resources
import pandas as pd
import numpy as np
from collections import defaultdict
from MDAnalysis import Universe
try:
    # for openmm version >= 7.6
    from openmm.app import PDBxFile
except:
    # for openmm version <= 7.5
    from simtk.openmm.app import PDBxFile
from .utils import _DNA_S1_TYPE, _DNA_S2_TYPE, _PRO_TYPE

_DNA_TYPES = [_DNA_S1_TYPE, _DNA_S2_TYPE]

[docs]def read_memory_template(template_list_dir = 'DEFAULT', template_list_filename = 'data/naked_database_list'): """Read the list of associative memory template filenames Args: template_list_dir (str, optional): directory of the template list file. Defaults to 'DEFAULT'. template_list (str, optional): name of the template list file. Defaults to 'data/naked_database_list'. Raises: Exception: custom template list is not supported right now Returns: pandas.DataFrame: fist column 'filename' and second column 'n_bp' """ if template_list_dir == 'DEFAULT': template_location = pkg_resources.resource_filename(__name__, template_list_filename) else: raise Exception("Custom memory template not implemented yet") memory_list = pd.read_csv(template_location, sep = '\s+') return memory_list
[docs]def generate_naked_memory_files(memory_dir = 'DEFAULT', memory_filename = 'naked.mem', memory_range = 4): """generate the file to store associative memory fragments Args: memory_dir (str, optional): Defaults to current workin directory. memory_filename (str, optional): Defaults to 'naked.mem'. memory_range (int, optional): Defaults to 4. """ print("Preparing the associative memory files......", end = ' ') if memory_dir == 'DEFAULT': memory_dir = os.getcwd() # Save the memory fragments in desired file memory_location = os.path.join(memory_dir, memory_filename) with open(memory_location, 'w') as mo: mo.write('[Target]\nnaked DNA\n[Memories]\n') mo.write('filename template_res memory_range\n') # Read the template dataframe memory_list = read_memory_template(template_list_filename = 'data/naked_database_list') # Generate memory fragments from the memory templates with open(memory_location, 'a') as mo: for _, row in memory_list.iterrows(): fragment = (row['n_bp'])//2 if fragment + memory_range < row['n_bp']: mo.write(f"{row['filename']} {fragment} {memory_range}\n") print("done")
[docs]def get_naked_memory_template_bonds(memory_list_dir = 'DEFAULT', memory_list_filename = 'naked.mem', database_location = 'data/memory_template/'): """Extract the positions of template associative memory fragments Args: memory_list_dir (str, optional): directory of the memory list. Defaults to current workin directory. memory_list_filename (str, optional): filename of the memory list. Defaults to 'naked.mem'. database_location (str, optional): location of the database. Defaults to 'data/memory_template/'. Returns: dict, 10 x 2 x memory_range x 3: positions of templates layer 1(10): template, keys 0-9 layer 2(2): dna type (strand), keys 'I', 'J' layer 3(np.array, memory_range x 3): positions of template residues in the range of memory_range """ if memory_list_dir == 'DEFAULT': memory_list_dir = os.getcwd() # read the associative memory list files memory_list_location = os.path.join(memory_list_dir, memory_list_filename) memory_list = pd.read_csv(memory_list_location, skiprows=4, sep='\s+', names=['filename', 'template_res', 'memory_range']) template_positions = {} # read each associative memory fragment for index, row in memory_list.iterrows(): fragment_filename = pkg_resources.resource_filename(__name__, database_location + row['filename']) # read the memory file template_positions[index] = {} frag = Universe(PDBxFile(fragment_filename)) # Read the bps within the memmory_length for each strand for dna_type in _DNA_TYPES: template_query = f"resid {row['template_res']}:{row['template_res'] + row['memory_range'] - 1} and name {dna_type}" template_atoms = frag.select_atoms(template_query) # get the positions template_positions[index][dna_type] = template_atoms.positions / 10 # MDanalysis default units Angstrom, convert to openmm default units nm return template_positions
[docs]def generate_nucleo_memory_files(memory_dir = 'DEFAULT', memory_filename = 'nucleo.mem', database_location = 'data/memory_template/', neighbor_radius = 100.0): """generate the file to store associative memory fragments for nucleosomes Args: memory_dir (str, optional): Defaults to current workin directory. memory_filename (str, optional): Defaults to 'nucleo.mem'. database_location (str, optional): Defaults to 'data/memory_template/'. neighbor_radius (float, optional): the radius to select neighbor contacts. in Angstrom for MDAnalysis. Defaults to 100.0 """ print("Preparing the nucleosome associative memory files......", end = ' ') if memory_dir == 'DEFAULT': memory_dir = os.getcwd() # Read the template dataframe memory_list = read_memory_template(template_list_filename = 'data/nucleo_database_list') # Save the memory fragments in desired file memory_location = os.path.join(memory_dir, memory_filename) with open(memory_location, 'w') as mo: mo.write('[Target]\nnucleosome\n[Memories]\n') mo.write('filename nucleo_center type1 res1 type2 res2_list\n') file_query = defaultdict(dict) file_center = {} # Generate memory fragments from the memory templates with open(memory_location, 'a') as mo: for _, row in memory_list.iterrows(): filename = row['filename'] nucleo_center = row['nucleo_center'] contacts = [int(x) for x in row['contacts'][1:-1].split(",")] dna_type = row['type'] # generate center memory (between hitone and contact DNA) type1 = _PRO_TYPE res1 = row['histone_res'] type2 = dna_type res2_list = contacts mo.write(f"{filename} {nucleo_center} {type1} {res1} {type2} [" + ",".join([str(int(x)) for x in res2_list]) + "]\n") # prepare for neighbor memory file_center[filename] = nucleo_center # prepare query to select contact atoms from the file file_query[filename][dna_type] = f"name {dna_type} and (" for res_id in contacts: file_query[filename][dna_type] += f"resid {res_id + 1} or " #openmm index starts from 0, mda from 1 file_query[filename][dna_type] = file_query[filename][dna_type][:-4] + ")" for filename in file_query: # go through memory files one by one fragment_filename = pkg_resources.resource_filename(__name__, database_location + filename) frag = Universe(PDBxFile(fragment_filename)) # select atoms for the two strands dna_atoms = {} for dna_type in _DNA_TYPES: dna_atoms[dna_type] = frag.select_atoms(file_query[filename][dna_type]) for res1_type in _DNA_TYPES: for res2_type in _DNA_TYPES: res1_res2_set = defaultdict(set) res1_atoms = dna_atoms[res1_type] res2_atoms= dna_atoms[res2_type] for atom1 in res1_atoms: for atom2 in res2_atoms: res1_id = atom1.resid-1 #openmm index from 0, mdanalysis index from 1 res2_id = atom2.resid-1 if res2_id < res1_id: continue if res2_id == res1_id and res1_type == res2_type: continue pos1 = atom1.position pos2 = atom2.position rm = np.linalg.norm(pos1 - pos2) if rm < neighbor_radius: res1_res2_set[res1_id].add(res2_id) for res1_id in res1_res2_set: res1_res2_set[res1_id] = sorted(res1_res2_set[res1_id]) res2_list = "["+",".join([str(int(x)) for x in res1_res2_set[res1_id]])+"]" with open(memory_location, 'a') as mo: mo.write(f"{filename} {file_center[filename]} {res1_type} {res1_id} {res2_type} {res2_list}\n") print("done")
[docs]def get_nucleo_memory_template_bonds(memory_list_dir = 'DEFAULT', memory_list_filename = 'nucleo.mem', database_location = 'data/memory_template/'): """Extract the positions of template associative memory fragments Args: memory_list_dir (str, optional): directory of the memory list. Defaults to current workin directory. memory_list_filename (str, optional): filename of the memory list. Defaults to 'nucleo.mem'. database_location (str, optional): location of the database. Defaults to 'data/memory_template/'. Returns: dict, 2 x N: lengths of center templates bonds, 1st layer is type, 2nd layer is relative res_id dict, 2 x 2 x N1 x N2: lengths of neighbor templates bonds between the pair of (type, res_id) atoms """ if memory_list_dir == 'DEFAULT': memory_list_dir = os.getcwd() # read the associative memory list files memory_list_location = os.path.join(memory_list_dir, memory_list_filename) memory_list = pd.read_csv(memory_list_location, skiprows=4, sep='\s+', names=['filename', 'nucleo_center', 'type1', 'res1', 'type2', 'res2_list']) # bonds dict initialization centerBonds = {} for dna_type in _DNA_TYPES: centerBonds[dna_type] = {} neighborBonds = {} for dna_type1 in _DNA_TYPES: neighborBonds[dna_type1] = {} for dna_type2 in _DNA_TYPES: neighborBonds[dna_type1][dna_type2] = defaultdict(dict) # read each associative memory fragment old_filename = None for index, row in memory_list.iterrows(): if old_filename != row['filename']: fragment_filename = pkg_resources.resource_filename(__name__, database_location + row['filename']) # read the memory file frag = Universe(PDBxFile(fragment_filename)) old_filename = row['filename'] nucleo_zero = row['nucleo_center'] type1 = row['type1'] res1 = row['res1'] type2 = row['type2'] res2_list = [int(x) for x in row['res2_list'][1:-1].split(",")] atom1 = frag.select_atoms(f"name {type1} and resid {res1 + 1}")[0] #openmm index starts from 0, mda from 1 pos1 = atom1.position / 10 # MDanalysis default units Angstrom, openmm default units nm for res2 in res2_list: atom2 = frag.select_atoms(f"name {type2} and resid {res2 + 1}")[0] #openmm index starts from 0, mda from 1 pos2 = atom2.position / 10 # MDanalysis default units Angstrom, openmm default units nm rm = np.linalg.norm(pos1 - pos2) if type1 == _PRO_TYPE: centerBonds[type2][res2 - nucleo_zero] = rm else: neighborBonds[type1][type2][res1 - nucleo_zero][res2 - nucleo_zero] = rm return centerBonds, neighborBonds