new script alexa

Former-commit-id: e36ec1fe4be85cdf789737bc3123acf1809d6ea7

new script alexa
a6a52657 · Fabien MAREUIL · 7f8036d0 · a6a52657
Commit a6a52657 authored 7 years ago by Fabien MAREUIL
--- a/pyScripts/get_interface_residues_ppc_dimer_IN_PROGRESS.py
+++ b/pyScripts/get_interface_residues_ppc_dimer_IN_PROGRESS.py
+# -*- coding: utf-8 -*-
+
+"""
+Author: Alexandra Moine-Franel
+Date: April 2018
+Version: 2
+
+
+
+1) IDENTIFY PROTEIN/PROTEIN INTERFACE RESIDUES
+
+2) SAVE EACH PROTEIN FIRST CHAIN IN SEPARATE FILE (after cleaning)
+	- Number of models
+	- Alternative atomic locations
+	- Heteroatoms
+
+
+
+
+Usage:	[Script.py] [PROTEIN/PROTEIN COMPLEX] -d (DISTANCE)
+-------------------------------------------------------------------------------
+
+Arguments:
+------------
+[PROTEIN/PROTEIN COMPLEX]:
+	Input (.pdb format)
+	Protein/protein complex
+
+(DISTANCE)- optionnal
+	Input (float or integer)
+	Distance threshold between the target and the partner
+	(6 angstroms - by defaut)
+
+
+Return:
+------------
+[PROTEIN/PROTEIN INTERFACE RESIDUES]: 
+	Output (.txt format) "PDB_Chain1-Chain2_distance.txt"
+	(chain ID, residue name and residue ID included)
+
+[PROTEIN CHAIN]: 
+	Output (.pdb format) "PDB_Chain1.pdb"
+	(heteroatom not included)
+"""
+
+# =============================================================================
+
+import os
+import sys
+import argparse
+import textwrap
+import logging
+import time
+
+from Bio.PDB import Select, PDBIO
+from Bio.PDB.PDBParser import PDBParser
+import itertools
+
+# =============================================================================
+
+LOG = logging.getLogger("IDENTIFY PROTEIN/PROTEIN INTERFACE RESIDUES")
+
+MODEL_FILENAME = 'pdb_multiple_models'
+ALTLOC_FILENAME = 'pdb_altloc'
+NODIMER_FILENAME = 'pdb_not_dimer'
+
+# =============================================================================
+
+def main(pdb, distance):
+	start = time.time()
+
+	pdb_code = file.strip().split('.pdb')[:-1]
+	structure = PDBParser().get_structure(pdb_code, file)
+	model = structure[0]
+
+	# Check if multiple models available
+	if len(list(structure.get_models())) != 0:
+		try:
+			if not os.path.exists('{}.txt'.format(MODEL_FILENAME)):
+				with open('{}.txt'.format(MODEL_FILENAME), 'w') as outfile:
+					outfile.write(pdb_code)
+			else:
+				with open('{}.txt'.format(MODEL_FILENAME), 'a') as outfile:
+					outfile.write(str(pdb_code) + '\n')
+		except IOError, e:
+			LOG.error('{}'.format(e))
+			sys.exit(1)
+			
+	# Check if 3D structure contains alternative atomic locations
+	if get_altloc(model) != 0: 
+		try:
+			if not os.path.exists('{}.txt'.format(ALTLOC_FILENAME)):
+				with open('{}.txt'.format(ALTLOC_FILENAME), 'w') as outfile:
+					outfile.write(str(pdb_code) + '\n')
+			else:
+				with open('{}.txt'.format(ALTLOC_FILENAME), 'a') as outfile:
+					outfile.write(str(pdb_code) + '\n')
+		except IOError, e:
+			LOG.error('{}'.format(e))
+			sys.exit(1)	
+	else:
+
+		# Get the first copy of each chain
+		target_partner_chain = select_first_chain(get_protein_assembly(structure))
+		print(target_partner_chain)
+
+		# Check if 3D structure contains more than two molecule IDs
+		if len(target_partner_chain) > 2:
+			try:
+				if not os.path.exists('{}.txt'.format(NODIMER_FILENAME_FILENAME)):
+					with open('{}.txt'.format(NODIMER_FILENAME_FILENAME), 'w') as outfile:
+						outfile.write(pdb_code)
+				else:
+					with open('{}.txt'.format(NODIMER_FILENAME_FILENAME), 'a') as outfile:
+						outfile.write(str(pdb_code) + '\n')
+			except IOError, e:
+				LOG.error('{}'.format(e))
+				sys.exit(1)
+
+		
+		# Remove heteroatom (protein/protein complex, no ligand)
+		for chain in target_partner_chain:
+			residue_to_remove = remove_hetatm(model, chain)
+			for residue in residue_to_remove:
+				model[chain].detach_child(residue[1])
+
+
+		# Get all chain-chain combinations
+		subset = permutation(target_partner_chain)
+
+
+		# Calculate chain-chain distance (interaction patch)
+		for chain_pair in subset:
+			target = chain_pair[0]
+			partner = chain_pair[1]	
+
+			if len(model[target].get_list()) > 3 and \
+			len(model[partner].get_list()) > 3:
+			
+				get_interface_residues(target, partner, model, distance)
+
+
+		# Save each chain in different file
+		for chain in target_partner_chain:
+			io = PDBIO()
+			io.set_structure(model)
+			io.save('{}_{}.pdb'.format(pdb_code, chain), select = ChainSelection(chain), \
+			preserve_atom_numbering = True)
+
+			
+			
+		end = time.time()
+		print('TIME: {}'.format(end - start))
+
+
+class ChainSelection(Select):
+	def __init__(self, chain):
+		self.chain = chain
+
+	def accept_chain(self, chain):
+		if chain.get_id() == self.chain:
+			return 1
+		else:
+			return 0
+
+
+def get_altloc(model):
+	altloc = 0
+	for chain in model.child_list:
+		for residue in chain.get_list():
+			if residue.is_disordered():
+				altloc = altloc + 1
+				break
+	return altloc
+
+
+def get_protein_assembly(structure):
+	assembly = {}
+	for molecule in structure.header['compound'].keys():
+		assembly[molecule] = []
+		for chain in structure.header['compound'][molecule]['chain'].split(','):
+			assembly[molecule].append(chain.strip().upper())
+	return assembly
+	# {'1': ['A', 'C', 'E'], '2': ['B', 'D']}
+
+
+def select_first_chain(assembly):
+	chain_id = []
+	for molecule in assembly.keys():
+		chain_id.append(assembly[molecule][0])
+	return chain_id
+	# ['A', 'B']
+
+
+def permutation(list_id):
+	subset = []
+	for permutation in itertools.permutations(list_id, 2):
+		subset.append(permutation)
+	print subset
+	return subset
+	# [(['A'], ['B']), (['B'], ['A'])]
+
+
+def remove_hetatm(model, chain):
+	residue_to_remove = []
+	for residue in model[chain].get_residues():
+		if residue.id[0] != ' ': 
+			residue_to_remove.append((model[chain].id, residue.id))
+	return residue_to_remove
+
+
+
+def get_interface_residues(target, partner, model, distance):
+	ires = []
+	with open('{}_{}{}_{}.txt'.format(model.parent.id, \
+	target, partner, distance), 'wb') as outfile:			
+ 
+			for rest in model[target].get_residues():
+				if ''.join(map(str,(rest.get_parent().get_id(), '.',
+				rest.get_resname(), rest.get_id()[1]))):
+					for atomt in rest.get_atoms():
+						
+						for atomp in model[partner].get_atoms():
+							xt = atomt.get_coord()[0]
+							xp = atomp.get_coord()[0]
+							dist_x = calculate_euclidist_x(xt, xp) 
+
+							if dist_x < float(distance):
+								yt = atomt.get_coord()[1]
+								yp = atomp.get_coord()[1]
+								dist_xy = calculate_euclidist_xy(xt, yt, xp, yp)
+
+								if dist_xy < float(distance):
+									zt = atomt.get_coord()[2]
+									zp = atomp.get_coord()[2]
+									dist_xyz = calculate_euclidist_xyz(xt, yt, zt, xp, yp, zp)
+
+									if dist_xyz < float(distance):
+										ires.append(''.join(map(str, \
+										(rest.get_parent().get_id(), '.', \
+										rest.get_resname(), rest.get_id()[1])))
+
+	for element in ires:
+			outfile.write(element + '\n')
+
+
+
+def calculate_euclidist_x(x1, x2):
+	"""
+	Calculate the distance x between two 1D points
+	----------------------------------------------------------------------
+	Arguments:
+		[float]: coordinate x of target atom
+				 coordinate x of partnet atom
+	Return:
+		[float]: distance 
+	"""
+	dist = ((x1-x2)**2) ** 0.5
+	return dist
+
+
+def calculate_euclidist_xy(x1, y1, x2, y2):
+	"""
+	Calculate the distance xy between two 2D points
+	----------------------------------------------------------------------
+	Arguments:
+		[float]: coordinates xy of target atom
+				 coordinates xy of partnet atom
+	Return:
+		[float]: distance 
+	"""
+	dist = ((x1-x2)**2 + (y1-y2)**2) ** 0.5	
+	return dist
+
+	
+def calculate_euclidist_xyz(x1, y1, z1, x2, y2, z2):
+	"""
+	Calculate the euclidian distance xyz between two 3D points
+	----------------------------------------------------------------------
+	Arguments:
+		[float]: coordinates xyz of target atom
+				 coordinates xyz of partnet atom
+	Return:
+		[float]: distance 
+	"""
+	dist = ((x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2) ** 0.5
+	return dist
+
+
+
+def setlogger():
+	LOG.setLevel(logging.INFO)
+	ch = logging.StreamHandler()
+	ch.setLevel(logging.INFO)
+	formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \
+	%(levelname)s - %(message)s')
+	ch.setFormatter(formatter)
+	LOG.addHandler(ch)
+
+# =============================================================================
+
+if __name__ == '__main__':
+	parser = argparse.ArgumentParser(description = 
+	textwrap.dedent('''
+	1) Format protein/protein complex PDB structure (no ligand)
+		1.a) Number of models
+		1.b) Alternative atomic locations
+		1.c) Heteroatoms
+	2) Identify protein/protein interface residues 
+	   (for combinations of first chain of each protein)
+	3) Save each chain in separate files''')
+	parser.add_argument('pdb', help = 
+	textwrap.dedent(''' Input [.pdb file]: PBD 3D structure
+	(with HEADER included)'''))
+	parser.add_argument('-d', dest = 'distance', help = 
+	textwrap.dedent('''Input [float or integer]: distance threshold \
+	{by default, 6 Angstroms}''')
+	options = parser.parse_args()
+	
+	if options.distance is None:
+		options.distance = 6.0
+				
+	setlogger()
+	main(options.file, options.distance)