diff --git a/pyScripts/get_pdb_structure.py b/pyScripts/get_pdb_structure.py deleted file mode 100644 index d5dd27838698a9550b60550d5b1d87dab7252d36..0000000000000000000000000000000000000000 --- a/pyScripts/get_pdb_structure.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: March 2018 -Version: 1 - - -DOWNLOAD PDB 3D STRUCTURE FILES FROM PDB - - -Usage: [Script.py] [pdb_id] ------------------------------------------------------------------- -Argument: - [file]: list of PDB IDs (txt format) - -Return: - [file]: PDB files, stored in 'PDB' folder -""" - -# ============================================================================= - -import os -import argparse -import logging -import csv -import Bio -from Bio.PDB import PDBList - -# ============================================================================= - -LOG = logging.getLogger("DOWNLOAD PDB 3D STRUCTURE FILES") - -FOLDER = 'PDB' - -# ============================================================================= - -def main(pdb_id): - get_pdb_file(get_pdbID(pdb_id)) - rename_pdb_file() - - -def get_pdbID(txtfile): - pdb_id = [] - with open(txtfile, 'rb') as pdb_txtfile: - for line in csv.reader(pdb_txtfile): - pdb_id.append(line[0].upper()) - return pdb_id - - -def get_pdb_file(pdbID): - for i in pdbID: - PDBList().retrieve_pdb_file(i, pdir = FOLDER, file_format = 'pdb') - - -def rename_pdb_file(): - working_dir = os.path.dirname(__file__) - pdb_dir = os.path.abspath(os.path.join(working_dir, FOLDER)) - os.chdir(pdb_dir) - for filename in os.listdir(pdb_dir): - os.rename(filename, filename.replace('pdb', '').replace('ent', 'pdb')) - - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - -# ============================================================================= - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = - 'DOWNLOAD PDB 3D STRUCTURE FILES FROM PDB') - parser.add_argument('pdb_id', help = 'Input [.txt file]: PDB IDs') - options = parser.parse_args() - setlogger() - main(options.pdb_id) diff --git a/pyScripts/get_ppc_V2.py b/pyScripts/get_ppc_V2.py deleted file mode 100644 index 6d801b9c9a1e30ed70137e206681968e4bca6a66..0000000000000000000000000000000000000000 --- a/pyScripts/get_ppc_V2.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: March 2018 -Version: 2 - - - -IDENTIFY PROTEIN-PROTEIN COMPLEXES FROM PDBe - - -Usage: [Script.py] [PDBe] ------------------------------------------------------------------- -Argument: - [PDBe]: - Input (json format) - PDBe (with [assembly_composition] and [assembly_type] information included) - -Return: - [list(PPC)]: - Output (txt format) - Protein/protein complex PDB IDs - - [json(PPC)]: - Output (json format) - Protein/protein complex information - - -Warning: -------- -PDBs with unique and repeated assembly composition annotation are both included! -[unique annotation: 'protein/protein complex'] -[repeated annotation: 'protein/protein complex, protein/protein complex'] -""" - -# ============================================================================= - -import os -import sys -import argparse -import logging -import json - -# ============================================================================= - -LOG = logging.getLogger('Get PPC PDB IDs from PDBe') -FILENAME = 'PPC-DIMER_3-26-18PDBe' - -XRAY = ['X-ray diffraction', 'X-ray powder diffraction'] - -RESOLUTION = 3.5 -DIFFR = 0.05 - -# ============================================================================= - -def main(pdbe): - with open(pdbe) as infile: - structure = json.load(infile) - dimer = get_dimer(get_ppc(structure)) - dimer_filt = filter_rfactor(filter_resolution(filter_annotation(dimer))) - to_json(dimer_filt, FILENAME) - to_txt(get_id(dimer_filt), FILENAME) - - if os.path.exists('{}.txt'.format(FILENAME)): - LOG.info('Finished!') - else: - LOG.error('Warning! [Output] Not found!') - sys.exit(1) - - - -def get_ppc(pdbe): - ppc_unique = [] - ppc_repeated = [] - for i in xrange(len(pdbe)): - if 'assembly_composition' in pdbe[i].keys(): - assembly_composition = pdbe[i]['assembly_composition'] - if len(assembly_composition) == 1 and \ - assembly_composition == ['protein/protein complex']: - ppc_unique.append(pdbe[i]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count( - assembly_composition[0]) == len(assembly_composition): - ppc_repeated.append(pdbe[i]) - ppc_all = ppc_unique + ppc_repeated - return ppc_all - - -def get_dimer(pdbe): - dimer_unique = [] - dimer_repeated = [] - for i in xrange(len(pdbe)): - if 'assembly_type' in pdbe[i].keys(): - assembly_type = pdbe[i]['assembly_type'] - if len(assembly_type) == 1 and \ - assembly_type == ['dimer']: - dimer_unique.append(pdbe[i]) - elif len(assembly_type) > 1: - if assembly_type.count('dimer') >= 1 and \ - assembly_type.count(assembly_type[0]) == len(assembly_type): - dimer_repeated.append(pdbe[i]) - dimer_all = dimer_unique + dimer_repeated - return dimer_all - - -def filter_annotation(pdbe): - annotation = [] - for i in xrange(len(pdbe)): - if len(pdbe[i]['experimental_method']) == 1: - annotation.append(pdbe[i]) - return annotation - -def filter_resolution(pdbe): - quality = [] - for i in xrange(len(pdbe)): - if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: - if 'resolution' in pdbe[i].keys() and \ - float(pdbe[i]['resolution']) <= RESOLUTION: - quality.append(pdbe[i]) - return quality - -def filter_rfactor(pdbe): - quality = [] - for i in xrange(len(pdbe)): - if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: - if 'r_free' in pdbe[i].keys() and 'r_factor' in pdbe[i].keys(): - if float(pdbe[i]['r_free'] - pdbe[i]['r_factor']) <= abs(DIFFR): - quality.append(pdbe[i]) - return quality - - -def get_id(pdbe): - pdb_id = [] - for i in xrange(len(pdbe)): - pdb_id.append(pdbe[i]['pdb_id']) - return pdb_id - - -def to_json(data, outfile): - with open('{}.json'.format(outfile), 'wb') as jsonfile: - json.dump(data, jsonfile) - - -def to_txt(data, outfile): - with open('{}.txt'.format(outfile), 'wb') as txtfile: - for pdb_id in data: - txtfile.write(pdb_id + '\n') - - - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - -# ============================================================================= - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = 'Get protein/protein \ - complex PDB IDs from PDBe') - parser.add_argument('pdbe', help = 'Input [.json file]: \ - PBDe (with assembly and structure quality information included') - options = parser.parse_args() - setlogger() - main(options.pdbe) diff --git a/pyScripts/get_ppc_id_from_pdbe.py b/pyScripts/get_ppc_id_from_pdbe.py deleted file mode 100644 index e41157a8c94a05634d5bcdfaea385670bb4a5abc..0000000000000000000000000000000000000000 --- a/pyScripts/get_ppc_id_from_pdbe.py +++ /dev/null @@ -1,121 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: February 2018 -Version: 1 - - - -IDENTIFY PROTEIN-PROTEIN COMPLEXES FROM PDBe - - -Usage: [Script.py] [PDBe] ------------------------------------------------------------------- -Argument: - [PDBe]: - Input (json format) - PDBe (with assembly information included) - -Return: - [list(PPC)]: - Output (txt format) - Protein/protein complex PDB IDs - - [json(PPC)]: - Output (json format) - Protein/protein complex information - - -Warning: -------- -PDBs with unique and repeated assembly composition annotation are both included! -[unique annotation: 'protein/protein complex'] -[repeated annotation: 'protein/protein complex, protein/protein complex'] -""" - -# ============================================================================= - -import argparse -import logging -import json -import pandas as pd - -# ============================================================================= - -LOG = logging.getLogger('Get protein/protein complex PDB IDs from PDBe') - -# ============================================================================= - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - - - -def main(pdbe): - structure = pd.read_json(pdbe, orient = 'columns') - ppc_to_txt(get_ppc_id(structure)) - ppc_to_json(get_ppc_data(structure)) - - - -def get_ppc_id(pdbe): - ppc_unique = [] - ppc_repeated = [] - for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): - assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] - if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: - ppc_unique.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['pdb_id']) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count(assembly_composition[0]) == len(assembly_composition): - ppc_repeated.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['pdb_id']) - ppc_all = ppc_unique + ppc_repeated - return ppc_all - - - -def get_ppc_data(pdbe): - ppc_unique_data = [] - ppc_repeated_data = [] - for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): - assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] - if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: - ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count(assembly_composition[0]) == len(assembly_composition): - ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - ppc_all_data = ppc_unique_data + ppc_repeated_data - return ppc_all_data - - - -def ppc_to_txt(ppc): - with open('PPC_2-20-18PDBe.txt', 'wb') as txtfile: - for pdb_id in ppc: - txtfile.write(pdb_id + '\n') - - - -def ppc_to_json(ppc): - with open('PPC_2-20-18PDBe.json', 'wb') as jsonfile: - json.dump(ppc, jsonfile) - - - -# ============================================================================== - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = - 'Get protein/protein complex PDB IDs from PDBe') - parser.add_argument('pdbe', help = 'Input [.json file]: \ - PBDe (with assembly information included') - options = parser.parse_args() - setlogger() - main(options.pdbe) diff --git a/pyScripts/get_ppc_ligand_V2.py b/pyScripts/get_ppc_ligand_V2.py deleted file mode 100644 index ed1f1576c33a4f3be066d1a2326152198654130b..0000000000000000000000000000000000000000 --- a/pyScripts/get_ppc_ligand_V2.py +++ /dev/null @@ -1,268 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: March 2018 -Version: 3 - - - -IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS (with or without including ions) -INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe - - -Usage: [Script.py] [PDBe] [IONS] ------------------------------------------------------------------- -Arguments: - [PDBe]: - Input (json format) - PDBe (with [assembly_composition], [compound_id] and [uniprot_id] included) - - [IONS]: - Input (txt format) - Compounds considered as ions - -Return: - [list(protein)]: - Output (txt format) - Proteins with compounds associated with a protein/protein complex PDB IDs - - [json(protein)]: - Output (json format) - Proteins with compounds associated with a protein/protein complex dataset - - Compounds: - --------- - ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] - or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] - - - [list(protein)]: - Output (txt format) - Proteins with compounds associated with a protein/protein complex PDB IDs - IONS ONLY EXCLUDED - - [json(protein)]: - Output (json format) - Proteins with compounds associated with a protein/protein complex dataset - IONS ONLY EXCLUDED - - Compounds: - --------- - ** EXAMPLE: [u'FE : FE (III) ION'] - - - -!Warning!: -------- -PDBs with unique and repeated assembly composition annotation are both included! -[unique annotation: 'protein/protein complex'] -[repeated annotation: 'protein/protein complex, protein/protein complex'] -""" - -# ============================================================================= - -import os -import sys -import argparse -import logging -import json - -# ============================================================================= - -LOG = logging.getLogger('Get PPC PDB IDs with ligand') -FILENAME_with_ions = 'PPC-PROT-COMPOUNDS_3-26-18PDBe' -FILENAME_without_ions = 'PPC-PROT-COMPOUNDS_noIONS_3-26-18PDBe' - -XRAY = ['X-ray diffraction', 'X-ray powder diffraction'] - -RESOLUTION = 3.5 -DIFFR = 0.05 - -# ============================================================================= - -def main(pdbe, ion): - with open(pdbe) as infile: - structure = json.load(infile) - - # Get protein/protein complex (PPC) - ppc = filter_rfactor(filter_resolution(filter_annotation( - get_ppc(structure)))) - to_txt(get_id(ppc), 'PPC_3-26-18PDBe') - - # Get protein structure with [compounds] - prot_compound = filter_rfactor(filter_resolution(filter_annotation( - get_with_compound(get_prot(structure))))) - - # Collect PPC UniProt - ppc_uniprot = get_uniprot(ppc) - - # Get PPC-associated proteins with compounds - ppc_ligand = get_prot_compound_ppc(prot_compound, ppc_uniprot) - to_json(ppc_ligand, FILENAME_with_ions) - to_txt(get_id(ppc_ligand), FILENAME_with_ions) - - # Get PPC-associated proteins with compounds (ions only not included) - ions = [] - with open(ion, 'r') as ion_file: - for i in ion_file: - ions.append(i.strip().upper()) - - prot_compound_without_ions = get_without_ions(prot_compound, ions) - ppc_ligand_without_ions = get_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot) - to_json(ppc_ligand_without_ions, FILENAME_without_ions) - to_txt(get_id(ppc_ligand_without_ions), FILENAME_without_ions) - - - if os.path.exists('{}.txt'.format(FILENAME_with_ions)) and \ - os.path.exists('{}.txt'.format(FILENAME_without_ions)): - LOG.info('Finished!') - else: - LOG.error('Warning! [Outputs] Not found!') - sys.exit(1) - - - -def get_ppc(pdbe): - ppc_unique = [] - ppc_repeated = [] - for i in xrange(len(pdbe)): - if 'assembly_composition' in pdbe[i].keys(): - assembly_composition = pdbe[i]['assembly_composition'] - if len(assembly_composition) == 1 and \ - assembly_composition == ['protein/protein complex']: - ppc_unique.append(pdbe[i]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count( - assembly_composition[0]) == len(assembly_composition): - ppc_repeated.append(pdbe[i]) - ppc_all = ppc_unique + ppc_repeated - return ppc_all - - -def get_prot(pdbe): - prot_unique = [] - prot_repeated = [] - for i in xrange(len(pdbe)): - if 'assembly_composition' in pdbe[i].keys(): - assembly_composition = pdbe[i]['assembly_composition'] - if len(assembly_composition) == 1 and \ - assembly_composition == ['protein structure']: - prot_unique.append(pdbe[i]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein structure') >= 1 and \ - assembly_composition.count( - assembly_composition[0]) == len(assembly_composition): - prot_repeated.append(pdbe[i]) - prot_all = prot_unique + prot_repeated - return prot_all - - -def filter_annotation(pdbe): - annotation = [] - for i in xrange(len(pdbe)): - if len(pdbe[i]['experimental_method']) == 1: - annotation.append(pdbe[i]) - return annotation - -def filter_resolution(pdbe): - quality = [] - for i in xrange(len(pdbe)): - if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: - if 'resolution' in pdbe[i].keys() and \ - float(pdbe[i]['resolution']) <= RESOLUTION: - quality.append(pdbe[i]) - return quality - -def filter_rfactor(pdbe): - quality = [] - for i in xrange(len(pdbe)): - if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: - if 'r_free' in pdbe[i].keys() and 'r_factor' in pdbe[i].keys(): - if float(pdbe[i]['r_free'] - pdbe[i]['r_factor']) <= abs(DIFFR): - quality.append(pdbe[i]) - return quality - - -def get_with_compound(pdbe): - compound = [] - for i in xrange(len(pdbe)): - if 'compound_id' in pdbe[i].keys(): - compound.append(pdbe[i]) - return compound - - -def get_uniprot(pdbe): - uniprot = [] - for i in xrange(len(pdbe)): - if 'uniprot_id' in pdbe[i].keys(): - uniprot.extend(pdbe[i]['uniprot_id']) - return uniprot - - -def get_prot_compound_ppc(protein, uniprot): - prot_compound_ppc = [] - for i in xrange(len(protein)): - if 'uniprot_id' in protein[i].keys(): - if len([e for e in protein[i]['uniprot_id'] if e in uniprot]) >= 1: - prot_compound_ppc.append(protein[i]) - return prot_compound_ppc - - -def get_without_ions(protein, ions): - without_ions = [] - with_ions_not_only = [] - for i in xrange(len(protein)): - if 'compound_id' in protein[i].keys(): - if len([e for e in protein[i]['compound_id'] if e in ions]) == 0: - without_ions.append(protein[i]) - elif len([e for e in protein[i]['compound_id'] if e in ions]) >= 1: - if len([e for e in protein[i]['compound_id'] if e in ions]) \ - != len(protein[i]['compound_id']): - with_ions_not_only.append(protein[i]) - print len(without_ions), len(with_ions_not_only) - compound_ions = without_ions + with_ions_not_only - return compound_ions - - -def get_id(pdbe): - pdb_id = [] - for i in xrange(len(pdbe)): - pdb_id.append(pdbe[i]['pdb_id']) - return pdb_id - - -def to_json(data, outfile): - with open('{}.json'.format(outfile), 'wb') as jsonfile: - json.dump(data, jsonfile) - - -def to_txt(data, outfile): - with open('{}.txt'.format(outfile), 'wb') as txtfile: - for pdb_id in data: - txtfile.write(pdb_id + '\n') - - - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - -# ============================================================================= - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = 'Get protein/protein \ - complex PDB IDs from PDBe') - parser.add_argument('pdbe', help = 'Input [.json file]: \ - PBDe (with assembly and structure quality information included') - parser.add_argument('ion', help = 'Input [.txt file]: Compounds considered \ - as ions') - options = parser.parse_args() - setlogger() - main(options.pdbe, options.ion) diff --git a/pyScripts/get_ppc_ligand_compound_from_pdbe.py b/pyScripts/get_ppc_ligand_compound_from_pdbe.py deleted file mode 100644 index a91caa2d8a00c3559d5e14213294b8308dff1b43..0000000000000000000000000000000000000000 --- a/pyScripts/get_ppc_ligand_compound_from_pdbe.py +++ /dev/null @@ -1,131 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: February 2018 -Version: 2 - - - -IDENTIFY INTERACTING LIGANDS AND COMPOUNDS OF PROTEIN-PROTEIN COMPLEXES FROM PDBe - - -Usage: [Script.py] [PDBe] ------------------------------------------------------------------- -Argument: - [PDBe]: - Input (json format) - PDBe (with assembly information included) - -Return: - [list(ligands)]: - Output (txt format) - Interacting ligands of protein/protein complexes - - [list(compounds)]: - Output (txt format) - Compounds of protein/protein complexes - - -Warning: -------- -PDBs with unique and repeated assembly composition annotation are both included! -[unique annotation: 'protein/protein complex'] -[repeated annotation: 'protein/protein complex, protein/protein complex'] -""" - -# ============================================================================= - -import argparse -import logging -import json -import pandas as pd - -# ============================================================================= - -LOG = logging.getLogger('Get ligands and compounds of protein/protein complexes from PDBe') - -# ============================================================================= - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - - - -def main(pdbe): - structure = pd.read_json(pdbe, orient = 'columns') - ligand_to_txt(get_ligand(get_ppc_data(structure))) - compound_to_txt(get_compound(get_ppc_data(structure))) - -def get_ppc_data(pdbe): - ppc_unique_data = [] - ppc_repeated_data = [] - for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): - assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] - if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: - ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count(assembly_composition[0]) == len(assembly_composition): - ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - ppc_all_data = ppc_unique_data + ppc_repeated_data - return ppc_all_data - - - -def get_ligand(ppc): - ppc_ligand = [] - unique_ligand = [] - for i in xrange(len(ppc)): - if 'interacting_ligands' in ppc[i].keys(): - ppc_ligand.append(ppc[i]['interacting_ligands'][0].split(' :')[0]) - - for j in ppc_ligand: - if j not in unique_ligand: - unique_ligand.append(j) - - #print len(unique_ligand) - return unique_ligand - - - -def get_compound(ppc): - ppc_compound = [] - unique_compound = [] - for i in xrange(len(ppc)): - if 'compound_id' in ppc[i].keys(): - ppc_compound.extend(ppc[i]['compound_id']) - - for j in ppc_compound: - if j not in unique_compound: - unique_compound.append(j) - - #print len(unique_compound) - return unique_compound - - -def ligand_to_txt(ligand): - with open('PPC_LIGANDS_2-20-18PDBe.txt', 'wb') as txtfile: - for lig in ligand: - txtfile.write(lig + '\n') - -def compound_to_txt(compound): - with open('PPC_COMPOUNDS_2-20-18PDBe.txt', 'wb') as txtfile: - for c in compound: - txtfile.write(c + '\n') - -# ============================================================================== - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = - 'Get ligands and compounds of protein/protein complexes from PDBe') - parser.add_argument('pdbe', help = 'Input [.json file]: \ - PBDe (with interacting ligand and compound information included') - options = parser.parse_args() - setlogger() - main(options.pdbe) diff --git a/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py b/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py deleted file mode 100644 index 59a2c0ac99f63241860924a3b5001e069f73a7ac..0000000000000000000000000000000000000000 --- a/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py +++ /dev/null @@ -1,321 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: February 2018 -Version: 2 - - - -IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS (with or without including ions) -INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe - - -Usage: [Script.py] [PDBe] ------------------------------------------------------------------- -Argument: - [PDBe]: - Input (json format) - PDBe (with [assembly_composition], [compound_id] and [uniprot_id] included) - -Return: - [list(protein)]: - Output (txt format) - Proteins with compounds associated with a protein/protein complex PDB IDs - - [json(protein)]: - Output (json format) - Proteins with compounds associated with a protein/protein complex dataset - - Compounds: - --------- - ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] - or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] - - - - [list(protein)]: - Output (txt format) - Proteins with compounds associated with a protein/protein complex PDB IDs - IONS ONLY EXCLUDED - - [json(protein)]: - Output (json format) - Proteins with compounds associated with a protein/protein complex dataset - IONS ONLY EXCLUDED - - Compounds: - --------- - ** EXAMPLE: [u'FE : FE (III) ION'] - - - -!Warning!: -------- -PDBs with unique and repeated assembly composition annotation are both included! -[unique annotation: 'protein/protein complex'] -[repeated annotation: 'protein/protein complex, protein/protein complex'] -""" - -# ============================================================================= - -import argparse -import logging -import json -import pandas as pd - -# ============================================================================= - -LOG = logging.getLogger('Proteins associated with a PPC - Compounds') - -# ============================================================================= - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - - -def main(pdbe): - structure = pd.read_json(pdbe, orient = 'columns') - - # Get protein/protein complex [assembly_composition] - ppc = get_ppc_data(structure) - - # Get protein structure [assembly_composition] with 'compounds' - prot_compound = get_compound_id(get_prot_data(structure)) - - # Collect PPC UniProt - ppc_uniprot = get_uniprot(ppc) - - # Identify protein structure with compounds associated to PPC - to_txt(get_id_prot_compound_ppc(prot_compound, ppc_uniprot)) - to_json(get_data_prot_compound_ppc(prot_compound, ppc_uniprot)) - - - # Get protein structure [assembly_composition] with compounds (ions only not included) - ion = get_ion(get_compound_name(prot_compound)) - prot_compound_without_ions = get_compound_without_ion(prot_compound, ion) - to_txt_ion(get_id_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot)) - to_json_ion(get_data_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot)) - - - - -def get_ppc_data(pdbe): - ppc_unique_data = [] - ppc_repeated_data = [] - for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): - assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] - if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: - ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein/protein complex') >= 1 and \ - assembly_composition.count(assembly_composition[0]) == len(assembly_composition): - ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - ppc_all_data = ppc_unique_data + ppc_repeated_data - return ppc_all_data - - - -def get_prot_data(pdbe): - prot_unique_data = [] - prot_repeated_data = [] - for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): - assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] - if len(assembly_composition) == 1 and assembly_composition == ['protein structure']: - prot_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - elif len(assembly_composition) > 1: - if assembly_composition.count('protein structure') >= 1 and \ - assembly_composition.count(assembly_composition[0]) == len(assembly_composition): - prot_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) - prot_all_data = prot_unique_data + prot_repeated_data - return prot_all_data - - - -def get_compound_id(data): - ''' - Get 3D structure with compound [compound_id] - -------------------------------------------------- - Argument: - [json]: protein structures (dataset) - - Return: - [json]: protein structures with compounds (dataset) - ''' - compound = [] - for i in xrange(len(data)): - if 'compound_id' in data[i].keys(): - compound.append(data[i]) - return compound - - - -def get_uniprot(data): - ''' - Get UniProt - ------------------------------------------------- - Argument: - [json]: protein/protein complexes (dataset) - - Return: - [list]: protein/protein complexes Uniprot IDs - ''' - uniprot = [] - for i in xrange(len(data)): - if 'uniprot_id' in data[i].keys(): - uniprot.extend(data[i]['uniprot_id']) - return uniprot - - -def get_id_prot_compound_ppc(data, uniprot): - ''' - Get PDB IDs only of protein structure with compounds associated with a protein/protein complex - ----------------------------------------------------------------------------------------------- - Arguments: - [json]: protein structures with compounds (dataset) - [list]: protein/protein complex uniprot id - - Return: - [list]: protein structure with compounds associated with a protein/protein complex PDB IDs - ''' - prot_compound_pcc = [] - for i in xrange(len(data)): - if 'uniprot_id' in data[i].keys(): - if len([e for e in data[i]['uniprot_id'] if e in uniprot]) >= 1: - prot_compound_pcc.append(data[i]['pdb_id']) - return prot_compound_pcc - - - -def get_data_prot_compound_ppc(data, uniprot): - ''' - Get data only of protein structure with compounds associated with a protein/protein complex - ----------------------------------------------------------------------------------------------- - Arguments: - [json]: protein structures with compounds (dataset) - [list]: protein/protein complex uniprot id - - Return: - [json]: protein structure with compounds associated with a protein/protein complex (dataset) - ''' - prot_compound_pcc_data = [] - for i in xrange(len(data)): - if 'uniprot_id' in data[i].keys(): - if len([e for e in data[i]['uniprot_id'] if e in uniprot]) >= 1: - prot_compound_pcc_data.append(data[i]) - return prot_compound_pcc_data - - - - -def get_compound_name(data): - ''' - Get protein compound names - -------------------------------------------------- - Argument: - [json]: protein structures with compounds (dataset) - - Return: - [list]: compound names - ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] - ''' - compound = [] - for i in xrange(len(data)): - if 'compound_name' in data[i].keys(): - compound.extend(data[i]['compound_name']) - return compound - - - -def get_ion(compound_name): - ''' - Get protein ions - -------------------------------------------------- - Argument: - [list]: compound names - ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] - - Return: - [list]: ions - ** EXAMPLE: ['FE'] - ''' - ion = [] - ion_unique = [] - for i in xrange(len(compound_name)): - if 'ION' in compound_name[i].split(' :')[1].split(' '): - ion.append(compound_name[i].split(' :')[0]) - for j in ion: - if j not in ion_unique: - ion_unique.append(j) - print ion_unique - return ion_unique - - - -def get_compound_without_ion(data, ions): - ''' - Get 3D structure with compound [compound_id], ion excluded - ---------------------------------------------------------- - Arguments: - [json]: protein structures (dataset) - [list]: ions - - Return: - [json]: protein structures with compounds != ions (dataset) - - !Warning! - --------- - Protein structures either with only compounds, or compounds and ions are included - ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] - or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] - - Only protein structures with only ions are excluded - ** EXAMPLE: [u'FE : FE (III) ION'] - ''' - compound_without_ion = [] - compound_with_ion = [] - for i in xrange(len(data)): - if 'compound_id' in data[i].keys(): - if len([e for e in data[i]['compound_id'] if e in ions]) == 0: - compound_without_ion.append(data[i]) - elif len([e for e in data[i]['compound_id'] if e in ions]) >= 1: - if len([e for e in data[i]['compound_id'] if e in ions]) != len(data[i]['compound_id']): - compound_with_ion.append(data[i]) - compound_ion = compound_without_ion + compound_with_ion - return compound_ion - - - -def to_txt(index): - with open('PROT-COMPOUNDS_PPC_2-20-18PDBe.txt', 'wb') as txtfile: - for i in index: - txtfile.write(i + '\n') - -def to_json(index): - with open('PROT-COMPOUNDS_PPC_2-20-18PDBe.json', 'wb') as jsonfile: - json.dump(index, jsonfile) - -def to_txt_ion(index): - with open('PROT-COMPOUNDS-noIONS_PPC_2-20-18PDBe.txt', 'wb') as txtfile: - for i in index: - txtfile.write(i + '\n') - -def to_json_ion(index): - with open('PROT-COMPOUNDS-noIONS_PPC_2-20-18PDBe.json', 'wb') as jsonfile: - json.dump(index, jsonfile) - -# ============================================================================== - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = - 'IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe') - parser.add_argument('pdbe', help = 'Input [.json file]: \ - PBDe (with [assembly_composition], [compound_id] and [uniprot_id] included)') - options = parser.parse_args() - setlogger() - main(options.pdbe) diff --git a/pyScripts/iPPI-DB_descriptors_V2.py b/pyScripts/iPPI-DB_descriptors_V2.py deleted file mode 100644 index ef516ce6ee41b26aecaada0d83730c0d57572e08..0000000000000000000000000000000000000000 --- a/pyScripts/iPPI-DB_descriptors_V2.py +++ /dev/null @@ -1,320 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: January 2018 -Version: 2 - - - -CALCULATE 2D/3D POCKET DESCRIPTORS FROM VOLSITE - - -Usage: [Script.py] [PDB_descriptor.txt] [PDB_2D-3Ddescriptors.csv] ------------------------------------------------------------------- - -[PDB_descriptor.txt]: - Input (txt format) - Pocket descriptors calculated by VolSite - -[PDB_2D-3Ddescriptors.csv]: - Output (csv format) - Pocket descriptors including: - * 89 descriptors from VolSite - * 10 descriptors using a linear combination of VolSite descriptors - (Kuenemann, 2016 Scientific Reports) - * 10 geometric descriptors from RDKit toolkit - (http://www.rdkit.org/docs/Overview.html, version 2017.09.1) -""" - -# ============================================================================= - -import os -import sys -import argparse -import logging -import csv -import pandas as pd -from rdkit import Chem -from rdkit.Chem import Descriptors3D - -# ============================================================================= - -LOG = logging.getLogger("CALCULATE 2D/3D POCKET DESCRIPTORS FROM VOLSITE") - - -VOLSITE_HEADER = [ - 'Volume', 'CZ', 'CA', 'O', 'OD1', 'OG', 'N', 'NZ', 'DU', - 'CZ40', 'CZ40-50', 'CZ50-60', 'CZ60-70', 'CZ70-80', 'CZ80-90', 'CZ90-100', 'CZ100-110', 'CZ110-120', 'CZ120', - 'CA40', 'CA40-50', 'CA50-60', 'CA60-70', 'CA70-80', 'CA80-90', 'CA90-100', 'CA100-110', 'CA110-120', 'CA120', - 'O40', 'O40-50', 'O50-60', 'O60-70', 'O70-80', 'O80-90', 'O90-100', 'O100-110', 'O110-120', 'O120', - 'OD140', 'OD140-50', 'OD150-60', 'OD160-70', 'OD170-80', 'OD180-90', 'OD190-100', 'OD1100-110', 'OD1110-120', - 'OD1120', - 'OG40', 'OG40-50', 'OG50-60', 'OG60-70', 'OG70-80', 'OG80-90', 'OG90-100', 'OG100-110', 'OG110-120', 'OG120', - 'N40', 'N40-50', 'N50-60', 'N60-70', 'N70-80', 'N80-90', 'N90-100', 'N100-110', 'N110-120', 'N120', - 'NZ40', 'NZ40-50', 'NZ50-60', 'NZ60-70', 'NZ70-80', 'NZ80-90', 'NZ90-100', 'NZ100-110', 'NZ110-120', 'NZ120', - 'DU40', 'DU40-50', 'DU50-60', 'DU60-70', 'DU70-80', 'DU80-90', 'DU90-100', 'DU100-110', 'DU110-120', 'DU120', - 'POCKET' -] - -DESC_COMBINATION = { - 'T40': ['CZ40', 'CA40', 'O40', 'OD140', 'OG40', 'N40', 'NZ40', 'DU40'], - 'T40-50': ['CZ40-50', 'CA40-50', 'O40-50', 'OD140-50', 'OG40-50', 'N40-50', 'NZ40-50', 'DU40-50'], - 'T50-60': ['CZ50-60', 'CA50-60', 'O50-60', 'OD150-60', 'OG50-60', 'N50-60', 'NZ50-60', 'DU50-60'], - 'T60-70': ['CZ60-70', 'CA60-70', 'O60-70', 'OD160-70', 'OG60-70', 'N60-70', 'NZ60-70', 'DU60-70'], - 'T70-80': ['CZ70-80', 'CA70-80', 'O70-80', 'OD170-80', 'OG70-80', 'N70-80', 'NZ70-80', 'DU70-80'], - 'T80-90': ['CZ80-90', 'CA80-90', 'O80-90', 'OD180-90', 'OG80-90', 'N80-90', 'NZ80-90', 'DU80-90'], - 'T90-100': ['CZ90-100', 'CA90-100', 'O90-100', 'OD190-100', 'OG90-100', 'N90-100', 'NZ90-100', 'DU90-100'], - 'T100-110': ['CZ100-110', 'CA100-110', 'O100-110', 'OD1100-110', 'OG100-110', 'N100-110', 'NZ100-110', 'DU100-110'], - 'T110-120': ['CZ110-120', 'CA110-120', 'O110-120', 'OD1110-120', 'OG110-120', 'N110-120', 'NZ110-120', 'DU110-120'], -} - -DESC_GEOM = [ - 'PMI1', 'PMI2', 'PMI3', 'NPR1', 'NPR2', 'Rgyr', - 'Asphericity', 'SpherocityIndex', 'Eccentricity', 'InertialShapeFactor' -] - -# ============================================================================= - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - - -def main(volsitedescriptor, pocketdescriptor): - volsite_csv = descriptor_volsite(volsitedescriptor) - combination_csv = descriptor_combination(volsite_csv) - volsite_combination_csv = "{}volsite+combinaison.csv".format( - volsite_csv.rsplit("volsite.csv")[0]) - merge_csv(volsite_csv, combination_csv, volsite_combination_csv) - geometric_csv = descriptor_geometric(volsitedescriptor) - merge_csv(volsite_combination_csv, geometric_csv, pocketdescriptor) - - if os.path.exists(pocketdescriptor): - os.remove(volsite_csv) - os.remove(combination_csv) - os.remove(volsite_combination_csv) - os.remove(geometric_csv) - LOG.info('Finished!') - else: - LOG.error('Error: 2D/3D Pocket Descriptor file not found') - sys.exit(1) - - -def descriptor_volsite(txtfile): - """ - Convert VolSite output file from txt format to csv format - (descriptor headers added) - --------------------------------------------------------- - Argument: - [file]: VolSite descriptors (txt format) - - Return: - [file]: VolSite descriptors (csv format) - with descriptor headers - """ - try: - with open(txtfile, 'rb') as infile: - volsite_csv = "{}_volsite.csv".format(os.path.splitext(txtfile)[0]) - with open(volsite_csv, 'wb') as outfile: - in_text = csv.reader(infile, delimiter=' ') - out_csv = csv.writer(outfile) - out_csv.writerow(VOLSITE_HEADER) - for line in in_text: - out_csv.writerow(line) - except IOError, e: - LOG.error('{}'.format(e)) - sys.exit(1) - return volsite_csv - - -def descriptor_combination(volsite_descriptor_csv): - """ - Calculate and write linear combinations of VolSite descriptors to csv file - (Kuenemann, 2016 Scientific Reports) - --------------------------------------------------------------------------- - Argument: - [file]: VolSite descriptors (csv format) - - Return: - [file]: linear combinations of VolSite descriptors (csv format) - """ - with open(volsite_descriptor_csv, 'rb') as infile: - volsite = csv.reader(infile, delimiter = ',') - descriptors = volsite.next() - - # Create a dictionnary - # key(descriptor): values(list of all pocket values) - data = {} - # Count the number of pockets - numbpocket = 0 - for header in descriptors: - data[header] = [] - for pocket in volsite: - numbpocket += + 1 - for header, value in zip(descriptors, pocket): - data[header].append(value) - combination_csv = "{}combination.csv".format( - volsite_descriptor_csv.rsplit("volsite.csv")[0]) - with open(combination_csv, 'wb') as outfile: - combivolsite = csv.writer(outfile, delimiter = ',', - lineterminator = '\n') - # Descriptor headers and values have to be in variables - # to write them in a new file column by column (sep = comma) - descname = [] - cdata = [] - for dch in DESC_COMBINATION.keys(): - # dch = descriptor combination header (column) - bsum = [] - for p in range(numbpocket): - # p = pocket (row) - buriedness_values = [] - for colname in DESC_COMBINATION[dch]: - # Get buriedness value by descriptors - buriedness_values.append(float(data[colname][p])) - # Get sum of buriedness value by descriptor combinations - bsum.append(sum(buriedness_values)) - descname.append(dch) - cdata.append(bsum) - combivolsite.writerow(descname) - datatransposed = zip(*cdata) - for row in datatransposed: - combivolsite.writerow(row) - return combination_csv - - -def descriptor_geometric(txtfile): - """ - Write geometric descriptor from the negative image of binding pockets - detected by VolSite to csv file - --------------------------------------------------------------------------- - Argument: - [file]: VolSite descriptors (txt format) - - Return: - [file]: shape descriptors (csv format) - """ - with open(txtfile, 'rb') as infile: - pocketvolsite = csv.reader(infile, delimiter = ',') - # Get the number of pockets - numbpocket = 0 - for pocket in pocketvolsite: - numbpocket += + 1 - geometric_csv = "{}geometric.csv".format(os.path.splitext(txtfile)[0]) - with open(geometric_csv, 'wb') as outfile: - shape = csv.writer(outfile, delimiter = ',', lineterminator = '\n') - shape.writerow(DESC_GEOM) - # Calculate geometric descriptors pocket by pockets - for cavity in range(1, numbpocket + 1): - pocket_mol2 = "{}_CAVITY_N{}_ALL.mol2".format( - geometric_csv.rsplit("_")[0], cavity) - replace_mol2atom(pocket_mol2) - shape.writerow(calculate_shape_descriptor( - "temp_{}_CAVITY_N{}_ALL.mol2".format( - geometric_csv.rsplit("_")[0], cavity))) - os.remove("temp_{}_CAVITY_N{}_ALL.mol2".format( - geometric_csv.rsplit("_")[0], cavity)) - return geometric_csv - - -def replace_mol2atom(cavity): - """ - Replace the atom name assigned by VolSite to the probes by 'C' (carbone) - - Warning 1 - Fixing the RDKit error: 'atom' with a degree > 1 - - Warning 2 - RDKit needs the output file [PDB_CAVITY_Nx_ALL_temp.mol2] - NOT the link - ------------------------------------------------------------------ - Argument: - [file]: negative image of a binding pocket (mol2 format) - - Return: - [file]: negative image of a binding pocket with carbone as atom name - (mol2 format, temporary file used as input by RDKit) - """ - header = True - parsing = False - with open(cavity, 'rb') as infile: - with open("temp_{}.mol2".format(cavity.rsplit('.')[0]), 'wb') as tempf: - for line in infile: - if line.startswith('@<TRIPOS>ATOM'): - tempf.write(line)# + '\n') - header = False - parsing = True - continue - elif line.startswith('@<TRIPOS>BOND'): - parsing = False - - if header: - tempf.write(line) - if parsing: - tempf.write(line.replace(line.split()[5], 'C')) - #return tempf - - -def calculate_shape_descriptor(cavity): - """ - Calculate geometric descriptors with RDKit - ------------------------------------------ - Geometric descriptor details: - http://www.rdkit.org/Python_Docs/rdkit.Chem.Descriptors3D-module.html - - Argument: - [file]: negative image of a binding pocket (mol2 format) - - Return: - [list]: geometric descriptors - """ - mol2file = Chem.MolFromMol2File(cavity, sanitize = False, removeHs = False) - geom_val = [] - geom_val.append(Descriptors3D.PMI1(mol2file)) - geom_val.append(Descriptors3D.PMI2(mol2file)) - geom_val.append(Descriptors3D.PMI3(mol2file)) - geom_val.append(Descriptors3D.NPR1(mol2file)) - geom_val.append(Descriptors3D.NPR2(mol2file)) - geom_val.append(Descriptors3D.RadiusOfGyration(mol2file)) - geom_val.append(Descriptors3D.Asphericity(mol2file)) - geom_val.append(Descriptors3D.SpherocityIndex(mol2file)) - geom_val.append(Descriptors3D.Eccentricity(mol2file)) - geom_val.append(Descriptors3D.InertialShapeFactor(mol2file)) - return geom_val - - -def merge_csv(csv_infile1, csv_infile2, csv_outfile): - """ - Concatenate two csv files - ------------------------- - Arguments: - [file]: input (csv format) - [file]: input (csv format) - [file]: output filename (csv format) - """ - csv1 = pd.read_csv(csv_infile1) - csv2 = pd.read_csv(csv_infile2) - csv3 = pd.concat([csv1, csv2], axis = 1) - return csv3.to_csv(csv_outfile, sep = ',', index = False) - -# ============================================================================== - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = - 'Calculate 2D/3D Pocket Descriptors from VolSite') - parser.add_argument('volsitedescriptor', help = 'Input [.txt file]: \ - Pocket descriptors calculated by VolSite') - parser.add_argument('pocketdescriptor', help = 'Output [.csv filename]: \ - Pocket descriptors including \ - 89 descriptors from VolSite, \ - 10 descriptors using a combination of VolSite descriptors \ - and 10 geometric descriptors') - - options = parser.parse_args() - setlogger() - main(options.volsitedescriptor, options.pocketdescriptor) diff --git a/pyScripts/ion.pdf b/pyScripts/ion.pdf deleted file mode 100644 index 46ecfa3582acda4ea7252068a70a9f9a1259d6f8..0000000000000000000000000000000000000000 Binary files a/pyScripts/ion.pdf and /dev/null differ diff --git a/pyScripts/ion.txt b/pyScripts/ion.txt deleted file mode 100644 index c2ec67982891e76539ee9654375ae4c8acb73de8..0000000000000000000000000000000000000000 --- a/pyScripts/ion.txt +++ /dev/null @@ -1,63 +0,0 @@ - -3NI -4MO -4PU -4TI -6MO -AG -AL -AU -AU3 -AUC -BA -BR -BS3 -CA -CD -CE -CF -CL -CO -CR -CS -CU -CU1 -ER3 -EU -EU3 -F -FE -FE2 -GA -HG -IOD -IR -IR3 -K -LA -LI -LU -MG -MN -MN3 -MOO -NA -NI -PD -PR -PT -PT4 -RB -RU -SB -SE4 -SM -SR -TB -TH -THE -TL -W -ZN -ZR - diff --git a/pyScripts/ppiinterface_dist.py b/pyScripts/ppiinterface_dist.py deleted file mode 100644 index e79e69ac85a5c9070b319f5dfa89d29c2570756f..0000000000000000000000000000000000000000 --- a/pyScripts/ppiinterface_dist.py +++ /dev/null @@ -1,335 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Author: Alexandra Moine-Franel -Date: February 2018 -Version: 4 - - -IDENTIFY PROTEIN INTERFACE / POCKET RESIDUES - -Determine protein residues at a specific distance either from protein partner, -ligand (small molecule) or negative image of the pocket generated by VolSite. - - - -Usage: [Script.py] [TARGET] [PARTNER] -d (DISTANCE) -------------------------------------------------------------------------------- -[TARGET]: - Input (.mol2 or .pdb format) - Protein CHAIN - -[PARTNER]: - Input (.mol2 or .pdb format) - Negative image of the pocket generated by VolSite (.mol2 format) - OR Ligand (.pdb format) - OR Protein (.pdb format) - -(DISTANCE)- optionnal - Input (float or integer) - Distance threshold between the target and the partner - * Distance target-protein/ligand = 5 angstroms - by defaut - * Distance target-cavity = 3.5 angstroms - by default - ------------------- - -[POCKET_RESIDUES]: - Output (.txt format) - 'PDB-targetchain-partnerchain_distance.txt' (target-protein/ligand) - OR 'PDB-targetchain-CAVITY_Nx_ALL_distance.txt' (target-cavity) -""" - -# ============================================================================= - -import csv -import os -import sys -import argparse -import logging -#import time -from biopandas.mol2 import PandasMol2 -from biopandas.pdb import PandasPdb - -# ============================================================================= - -LOG = logging.getLogger("IDENTIFY PROTEIN INTERFACE / POCKET RESIDUES") - -# ============================================================================= - -def main(target, partner, distance): - #start = time.time() - if target and partner: - if os.path.splitext(target)[-1] == '.mol2' and \ - os.path.splitext(partner)[-1] == '.mol2': - mol2target = parse_mol2(target) - mol2partner = parse_mol2(partner) - mol2_respocket(mol2target, mol2partner, distance) - if os.path.exists('{}-{}_{}.txt'.format( \ - mol2target.code, mol2partner.code.rsplit('.mol2')[0], \ - distance)): - LOG.info('Finished!') - #end = time.time() - #print(end - start) - else: - LOG.error('Warning! [Output] Not found!') - sys.exit(1) - elif os.path.splitext(target)[-1] == '.pdb' and \ - os.path.splitext(partner)[-1] == '.pdb': - pdbtarget = parse_pdb(target) - pdbpartner = parse_pdb(partner) - if pdbpartner.df['ATOM'].empty == True: - pdb_resinterfacePL(pdbtarget, pdbpartner, distance) - if os.path.exists('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ - pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['HETATM']['residue_name'][0], \ - distance)): - LOG.info('Finished!') - #end = time.time() - #print(end - start) - else: - LOG.error('Warning! [Output] Not found!') - sys.exit(1) - else: - pdb_resinterfacePP(pdbtarget, pdbpartner, distance) - if os.path.exists('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ - pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['ATOM']['chain_id'][0], \ - distance)): - LOG.info('Finished!') - #end = time.time() - #print(end - start) - else: - LOG.error('Warning! [Output] Not found!') - sys.exit(1) - else: - LOG.error('Error! [TARGET] or [PARTNER] file not in .mol2 or .pdb format') - sys.exit(1) - elif target and not partner: - parser.error('Error: [PARTNER: protein, ligand or cavity] is required') - - - - -def parse_mol2(mol2file): - """ - Parse .mol2 file - --------------------------------------------------------------------------- - Argument: [file]: .mol2 format - Return: [dataframe] - """ - try: - pmol = PandasMol2().read_mol2(mol2file) - except(IOError), e: - LOG.error('{}'.format(e)) - sys.exit(1) - else: - return pmol - - -def parse_pdb(pdbfile): - """ - Parse .pdb file - --------------------------------------------------------------------------- - Argument: [file]: .pdb format - Return: [dataframe] - """ - try: - ppdb = PandasPdb().read_pdb(pdbfile) - except(IOError), e: - LOG.error('{}'.format(e)) - sys.exit(1) - else: - return ppdb - - -def mol2_respocket(mol2target, mol2partner, distance): - """ - Identify protein target residues at a specified distance from its partner - (i.e. the cavity negative image generated by VolSite). - --------------------------------------------------------------------------- - Arguments: - [file]: protein target (.mol2 format) - [file]: negative image of the binding pocket (.mol2 format) - [float]: distance threshold (by default, 3.5 angstroms) - Return: - [file]: pocket residues (.txt format) - """ - ppires = [] - with open('{}-{}_{}.txt'.format(mol2target.code, \ - mol2partner.code.rsplit('.mol2')[0], \ - distance), 'wb') as outfile: - for probe in xrange(len(mol2partner.df.atom_id)): - for atom in xrange(len(mol2target.df.atom_id)): - if mol2target.df.subst_name[atom] not in ppires: - xp = mol2target.df['x'][atom] - xc = mol2partner.df['x'][probe] - distX = x_dist(xp, xc) - if distX < float(distance): - yp = mol2target.df['y'][atom] - yc = mol2partner.df['y'][probe] - distXY = xy_dist(xp, yp, xc, yc) - if distXY < float(distance): - zp = mol2target.df['z'][atom] - zc = mol2partner.df['z'][probe] - distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) - if distXYZ < float(distance): - ppires.append(mol2target.df.subst_name[atom]) - for resp in ppires: - outfile.write(resp + '\n') - - -def pdb_resinterfacePP(pdbtarget, pdbpartner, distance): - """ - Identify protein target residues at a specified distance from its partner - (i.e. protein) - --------------------------------------------------------------------------- - Arguments: - [file]: protein target (.pdb format) - [file]: protein partner (.pdb format) - [float]: distance threshold (by default, 5 angstroms) - Return: - [file]: interface residues (.txt format) - """ - ppires = [] - with open('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ - pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['ATOM']['chain_id'][0], \ - distance), 'wb') as outfile: - for atomt in xrange(len(pdbpartner.df['ATOM']['atom_number'])): - for atomp in xrange(len(pdbtarget.df['ATOM']['atom_number'])): - if ''.join(map(str, (pdbtarget.df['ATOM']['residue_name'][atomp], \ - pdbtarget.df['ATOM']['residue_number'][atomp]))) not in ppires: - xp = pdbtarget.df['ATOM']['x_coord'][atomp] - xc = pdbpartner.df['ATOM']['x_coord'][atomt] - distX = x_dist(xp, xc) - if distX < float(distance): - yp = pdbtarget.df['ATOM']['y_coord'][atomp] - yc = pdbpartner.df['ATOM']['y_coord'][atomt] - distXY = xy_dist(xp, yp, xc, yc) - if distXY < float(distance): - zp = pdbtarget.df['ATOM']['z_coord'][atomp] - zc = pdbpartner.df['ATOM']['z_coord'][atomt] - distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) - if distXYZ < float(distance): - ppires.append(''.join(map(str, \ - (pdbtarget.df['ATOM']['residue_name'][atomp], \ - pdbtarget.df['ATOM']['residue_number'][atomp])))) - for resp in ppires: - outfile.write(resp + '\n') - - -def pdb_resinterfacePL(pdbtarget, pdbpartner, distance): - """ - Identify protein target residues at a specified distance from its partner - (i.e. ligand) - --------------------------------------------------------------------------- - Arguments: - [file]: protein target (.pdb format) - [file]: ligand (.pdb format) - [float]: distance threshold (by default, 5 angstroms) - Return: - [file]: pocket residues (.txt format) - """ - ppires = [] - with open('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ - pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['HETATM']['residue_name'][0], \ - distance), 'wb') as outfile: - for atomt in xrange(len(pdbpartner.df['HETATM']['atom_number'])): - for atomp in xrange(len(pdbtarget.df['ATOM']['atom_number'])): - if ''.join(map(str, (pdbtarget.df['ATOM']['residue_name'][atomp], \ - pdbtarget.df['ATOM']['residue_number'][atomp]))) not in ppires: - xp = pdbtarget.df['ATOM']['x_coord'][atomp] - xc = pdbpartner.df['HETATM']['x_coord'][atomt] - distX = x_dist(xp, xc) - if distX < float(distance): - yp = pdbtarget.df['ATOM']['y_coord'][atomp] - yc = pdbpartner.df['HETATM']['y_coord'][atomt] - - distXY = xy_dist(xp, yp, xc, yc) - if distXY < float(distance): - zp = pdbtarget.df['ATOM']['z_coord'][atomp] - zc = pdbpartner.df['HETATM']['z_coord'][atomt] - distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) - if distXYZ < float(distance): - ppires.append(''.join(map(str, \ - (pdbtarget.df['ATOM']['residue_name'][atomp], \ - pdbtarget.df['ATOM']['residue_number'][atomp])))) - for resp in ppires: - outfile.write(resp + '\n') - - -def x_dist(x1, x2): - """ - Calculate the distance x between two 1D points - ---------------------------------------------------------------------- - Arguments: - [float]: coordinate x of target atom - coordinate x of partnet atom - Return: - [float]: distance - """ - dist = ((x1-x2)**2) ** 0.5 - return dist - - -def xy_dist(x1, y1, x2, y2): - """ - Calculate the distance xy between two 2D points - ---------------------------------------------------------------------- - Arguments: - [float]: coordinates xy of target atom - coordinates xy of partnet atom - Return: - [float]: distance - """ - dist = ((x1-x2)**2 + (y1-y2)**2) ** 0.5 - return dist - - -def euclidian_dist(x1, y1, z1, x2, y2, z2): - """ - Calculate the euclidian distance xyz between two 3D points - ---------------------------------------------------------------------- - Arguments: - [float]: coordinates xyz of target atom - coordinates xyz of partnet atom - Return: - [float]: distance - """ - dist = ((x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2) ** 0.5 - return dist - - - -def setlogger(): - LOG.setLevel(logging.INFO) - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ - %(levelname)s - %(message)s') - ch.setFormatter(formatter) - LOG.addHandler(ch) - - -def default_distance(options): - if os.path.splitext(options.partner)[-1] == '.mol2': - options.distance = 3.5 - elif os.path.splitext(options.partner)[-1] == '.pdb': - options.distance = 5.0 - return options.distance - -# ============================================================================== - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description = 'Identify interface/pocket residues') - parser.add_argument('target', \ - help = 'Input [.mol2 or .pdb file]: protein target') - parser.add_argument('partner', \ - help = 'Input [.pdb file]: protein partner or ligand; OR [.mol2 file]: cavity') - parser.add_argument('-d', dest = 'distance', type = float, \ - help = 'Input [float or integer]: distance threshold \ - {by default, target-protein/ligand = 5A; target-cavity = 3.5A}') - options = parser.parse_args() - - if options.distance is None: - default_distance(options) - - setlogger() - main(options.target, options.partner, options.distance)