diff --git a/ippisite/db.sqlite3.REMOVED.git-id b/ippisite/db.sqlite3.REMOVED.git-id index b63f9dc481dd16289e6aba0b596ae5129a6a3db1..d23a7914fd588b82f000b3e0267b142b66545d6d 100644 --- a/ippisite/db.sqlite3.REMOVED.git-id +++ b/ippisite/db.sqlite3.REMOVED.git-id @@ -1 +1 @@ -6def815c597d9d9c8c7e97e6b7e8fd71da7d64ac \ No newline at end of file +b3413ac6f067ab89b2dad63c444b3021ffeef640 \ No newline at end of file diff --git a/ippisite/ippidb/management/commands/import_v1_data.py b/ippisite/ippidb/management/commands/import_v1_data.py index 6e4a63edca6ae664edc8b58df560beebd89540c2..e525b46f460e13f0a66b0eef7a9fd99d9b27e9a5 100644 --- a/ippisite/ippidb/management/commands/import_v1_data.py +++ b/ippisite/ippidb/management/commands/import_v1_data.py @@ -560,6 +560,7 @@ select distinct protein.NumUniprot, domain.PfamNumAccession , complexe.NbCopy, for row in rows: try: compound = Compound() + compound.id = row[0] compound.canonical_smile = row[1] compound.is_macrocycle = (row[4] == 'Y') compound.aromatic_ratio = row[5] diff --git a/pyScripts/get_pdb_structure.py b/pyScripts/get_pdb_structure.py new file mode 100644 index 0000000000000000000000000000000000000000..d5dd27838698a9550b60550d5b1d87dab7252d36 --- /dev/null +++ b/pyScripts/get_pdb_structure.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: March 2018 +Version: 1 + + +DOWNLOAD PDB 3D STRUCTURE FILES FROM PDB + + +Usage: [Script.py] [pdb_id] +------------------------------------------------------------------ +Argument: + [file]: list of PDB IDs (txt format) + +Return: + [file]: PDB files, stored in 'PDB' folder +""" + +# ============================================================================= + +import os +import argparse +import logging +import csv +import Bio +from Bio.PDB import PDBList + +# ============================================================================= + +LOG = logging.getLogger("DOWNLOAD PDB 3D STRUCTURE FILES") + +FOLDER = 'PDB' + +# ============================================================================= + +def main(pdb_id): + get_pdb_file(get_pdbID(pdb_id)) + rename_pdb_file() + + +def get_pdbID(txtfile): + pdb_id = [] + with open(txtfile, 'rb') as pdb_txtfile: + for line in csv.reader(pdb_txtfile): + pdb_id.append(line[0].upper()) + return pdb_id + + +def get_pdb_file(pdbID): + for i in pdbID: + PDBList().retrieve_pdb_file(i, pdir = FOLDER, file_format = 'pdb') + + +def rename_pdb_file(): + working_dir = os.path.dirname(__file__) + pdb_dir = os.path.abspath(os.path.join(working_dir, FOLDER)) + os.chdir(pdb_dir) + for filename in os.listdir(pdb_dir): + os.rename(filename, filename.replace('pdb', '').replace('ent', 'pdb')) + + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + +# ============================================================================= + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = + 'DOWNLOAD PDB 3D STRUCTURE FILES FROM PDB') + parser.add_argument('pdb_id', help = 'Input [.txt file]: PDB IDs') + options = parser.parse_args() + setlogger() + main(options.pdb_id) diff --git a/pyScripts/get_ppc_V2.py b/pyScripts/get_ppc_V2.py new file mode 100644 index 0000000000000000000000000000000000000000..6d801b9c9a1e30ed70137e206681968e4bca6a66 --- /dev/null +++ b/pyScripts/get_ppc_V2.py @@ -0,0 +1,171 @@ +# -*- coding: 
utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: March 2018 +Version: 2 + + + +IDENTIFY PROTEIN-PROTEIN COMPLEXES FROM PDBe + + +Usage: [Script.py] [PDBe] +------------------------------------------------------------------ +Argument: + [PDBe]: + Input (json format) + PDBe (with [assembly_composition] and [assembly_type] information included) + +Return: + [list(PPC)]: + Output (txt format) + Protein/protein complex PDB IDs + + [json(PPC)]: + Output (json format) + Protein/protein complex information + + +Warning: +------- +PDBs with unique and repeated assembly composition annotation are both included! +[unique annotation: 'protein/protein complex'] +[repeated annotation: 'protein/protein complex, protein/protein complex'] +""" + +# ============================================================================= + +import os +import sys +import argparse +import logging +import json + +# ============================================================================= + +LOG = logging.getLogger('Get PPC PDB IDs from PDBe') +FILENAME = 'PPC-DIMER_3-26-18PDBe' + +XRAY = ['X-ray diffraction', 'X-ray powder diffraction'] + +RESOLUTION = 3.5 +DIFFR = 0.05 + +# ============================================================================= + +def main(pdbe): + with open(pdbe) as infile: + structure = json.load(infile) + dimer = get_dimer(get_ppc(structure)) + dimer_filt = filter_rfactor(filter_resolution(filter_annotation(dimer))) + to_json(dimer_filt, FILENAME) + to_txt(get_id(dimer_filt), FILENAME) + + if os.path.exists('{}.txt'.format(FILENAME)): + LOG.info('Finished!') + else: + LOG.error('Warning! [Output] Not found!') + sys.exit(1) + + + +def get_ppc(pdbe): + ppc_unique = [] + ppc_repeated = [] + for i in xrange(len(pdbe)): + if 'assembly_composition' in pdbe[i].keys(): + assembly_composition = pdbe[i]['assembly_composition'] + if len(assembly_composition) == 1 and \ + assembly_composition == ['protein/protein complex']: + ppc_unique.append(pdbe[i]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count( + assembly_composition[0]) == len(assembly_composition): + ppc_repeated.append(pdbe[i]) + ppc_all = ppc_unique + ppc_repeated + return ppc_all + + +def get_dimer(pdbe): + dimer_unique = [] + dimer_repeated = [] + for i in xrange(len(pdbe)): + if 'assembly_type' in pdbe[i].keys(): + assembly_type = pdbe[i]['assembly_type'] + if len(assembly_type) == 1 and \ + assembly_type == ['dimer']: + dimer_unique.append(pdbe[i]) + elif len(assembly_type) > 1: + if assembly_type.count('dimer') >= 1 and \ + assembly_type.count(assembly_type[0]) == len(assembly_type): + dimer_repeated.append(pdbe[i]) + dimer_all = dimer_unique + dimer_repeated + return dimer_all + + +def filter_annotation(pdbe): + annotation = [] + for i in xrange(len(pdbe)): + if len(pdbe[i]['experimental_method']) == 1: + annotation.append(pdbe[i]) + return annotation + +def filter_resolution(pdbe): + quality = [] + for i in xrange(len(pdbe)): + if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: + if 'resolution' in pdbe[i].keys() and \ + float(pdbe[i]['resolution']) <= RESOLUTION: + quality.append(pdbe[i]) + return quality + +def filter_rfactor(pdbe): + quality = [] + for i in xrange(len(pdbe)): + if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: + if 'r_free' in pdbe[i].keys() and 'r_factor' in pdbe[i].keys(): + if float(pdbe[i]['r_free'] - pdbe[i]['r_factor']) <= abs(DIFFR): + quality.append(pdbe[i]) + return quality 
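Editor's note on the three quality filters above (filter_annotation, filter_resolution, filter_rfactor): the committed check "float(pdbe[i]['r_free'] - pdbe[i]['r_factor']) <= abs(DIFFR)" accepts any entry whose R-free is smaller than its R-factor, however large the gap, because abs() is applied to the constant rather than to the difference. The following is a minimal Python 3 sketch of an equivalent single-pass filter, not the committed code; it assumes the same PDBe entry dictionaries and keys ('experimental_method', 'resolution', 'r_free', 'r_factor') that this script already uses, and it applies abs() to the gap itself on the assumption that that is the intended test.

# Hedged sketch, not part of the diff: one-pass quality filter over PDBe entry dicts.
XRAY = ['X-ray diffraction', 'X-ray powder diffraction']
RESOLUTION = 3.5
DIFFR = 0.05

def filter_quality(entries, max_resolution=RESOLUTION, max_r_diff=DIFFR):
    """Keep entries solved by a single X-ray method, at or below the
    resolution cut-off, whose |R-free - R-factor| gap is small."""
    quality = []
    for entry in entries:
        methods = entry.get('experimental_method', [])
        if len(methods) != 1 or methods[0] not in XRAY:
            continue
        if float(entry.get('resolution', float('inf'))) > max_resolution:
            continue
        if 'r_free' in entry and 'r_factor' in entry:
            # abs() on the difference is an assumption about the intent;
            # the committed code compares the raw difference to abs(DIFFR).
            if abs(float(entry['r_free']) - float(entry['r_factor'])) <= max_r_diff:
                quality.append(entry)
    return quality

If entries with R-free below R-factor should always pass regardless of the gap, the original comparison can be kept and only the placement of abs() documented.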
+ + +def get_id(pdbe): + pdb_id = [] + for i in xrange(len(pdbe)): + pdb_id.append(pdbe[i]['pdb_id']) + return pdb_id + + +def to_json(data, outfile): + with open('{}.json'.format(outfile), 'wb') as jsonfile: + json.dump(data, jsonfile) + + +def to_txt(data, outfile): + with open('{}.txt'.format(outfile), 'wb') as txtfile: + for pdb_id in data: + txtfile.write(pdb_id + '\n') + + + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ + %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + +# ============================================================================= + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = 'Get protein/protein \ + complex PDB IDs from PDBe') + parser.add_argument('pdbe', help = 'Input [.json file]: \ + PBDe (with assembly and structure quality information included') + options = parser.parse_args() + setlogger() + main(options.pdbe) diff --git a/pyScripts/get_ppc_id_from_pdbe.py b/pyScripts/get_ppc_id_from_pdbe.py new file mode 100644 index 0000000000000000000000000000000000000000..e41157a8c94a05634d5bcdfaea385670bb4a5abc --- /dev/null +++ b/pyScripts/get_ppc_id_from_pdbe.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: February 2018 +Version: 1 + + + +IDENTIFY PROTEIN-PROTEIN COMPLEXES FROM PDBe + + +Usage: [Script.py] [PDBe] +------------------------------------------------------------------ +Argument: + [PDBe]: + Input (json format) + PDBe (with assembly information included) + +Return: + [list(PPC)]: + Output (txt format) + Protein/protein complex PDB IDs + + [json(PPC)]: + Output (json format) + Protein/protein complex information + + +Warning: +------- +PDBs with unique and repeated assembly composition annotation are both included! 
+[unique annotation: 'protein/protein complex'] +[repeated annotation: 'protein/protein complex, protein/protein complex'] +""" + +# ============================================================================= + +import argparse +import logging +import json +import pandas as pd + +# ============================================================================= + +LOG = logging.getLogger('Get protein/protein complex PDB IDs from PDBe') + +# ============================================================================= + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + + + +def main(pdbe): + structure = pd.read_json(pdbe, orient = 'columns') + ppc_to_txt(get_ppc_id(structure)) + ppc_to_json(get_ppc_data(structure)) + + + +def get_ppc_id(pdbe): + ppc_unique = [] + ppc_repeated = [] + for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): + assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] + if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: + ppc_unique.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['pdb_id']) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count(assembly_composition[0]) == len(assembly_composition): + ppc_repeated.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['pdb_id']) + ppc_all = ppc_unique + ppc_repeated + return ppc_all + + + +def get_ppc_data(pdbe): + ppc_unique_data = [] + ppc_repeated_data = [] + for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): + assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] + if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: + ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count(assembly_composition[0]) == len(assembly_composition): + ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + ppc_all_data = ppc_unique_data + ppc_repeated_data + return ppc_all_data + + + +def ppc_to_txt(ppc): + with open('PPC_2-20-18PDBe.txt', 'wb') as txtfile: + for pdb_id in ppc: + txtfile.write(pdb_id + '\n') + + + +def ppc_to_json(ppc): + with open('PPC_2-20-18PDBe.json', 'wb') as jsonfile: + json.dump(ppc, jsonfile) + + + +# ============================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = + 'Get protein/protein complex PDB IDs from PDBe') + parser.add_argument('pdbe', help = 'Input [.json file]: \ + PBDe (with assembly information included') + options = parser.parse_args() + setlogger() + main(options.pdbe) diff --git a/pyScripts/get_ppc_ligand_V2.py b/pyScripts/get_ppc_ligand_V2.py new file mode 100644 index 0000000000000000000000000000000000000000..ed1f1576c33a4f3be066d1a2326152198654130b --- /dev/null +++ b/pyScripts/get_ppc_ligand_V2.py @@ -0,0 +1,268 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: March 2018 +Version: 3 + + + +IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS (with or without including ions) 
+INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe + + +Usage: [Script.py] [PDBe] [IONS] +------------------------------------------------------------------ +Arguments: + [PDBe]: + Input (json format) + PDBe (with [assembly_composition], [compound_id] and [uniprot_id] included) + + [IONS]: + Input (txt format) + Compounds considered as ions + +Return: + [list(protein)]: + Output (txt format) + Proteins with compounds associated with a protein/protein complex PDB IDs + + [json(protein)]: + Output (json format) + Proteins with compounds associated with a protein/protein complex dataset + + Compounds: + --------- + ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] + or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] + + + [list(protein)]: + Output (txt format) + Proteins with compounds associated with a protein/protein complex PDB IDs + IONS ONLY EXCLUDED + + [json(protein)]: + Output (json format) + Proteins with compounds associated with a protein/protein complex dataset + IONS ONLY EXCLUDED + + Compounds: + --------- + ** EXAMPLE: [u'FE : FE (III) ION'] + + + +!Warning!: +------- +PDBs with unique and repeated assembly composition annotation are both included! +[unique annotation: 'protein/protein complex'] +[repeated annotation: 'protein/protein complex, protein/protein complex'] +""" + +# ============================================================================= + +import os +import sys +import argparse +import logging +import json + +# ============================================================================= + +LOG = logging.getLogger('Get PPC PDB IDs with ligand') +FILENAME_with_ions = 'PPC-PROT-COMPOUNDS_3-26-18PDBe' +FILENAME_without_ions = 'PPC-PROT-COMPOUNDS_noIONS_3-26-18PDBe' + +XRAY = ['X-ray diffraction', 'X-ray powder diffraction'] + +RESOLUTION = 3.5 +DIFFR = 0.05 + +# ============================================================================= + +def main(pdbe, ion): + with open(pdbe) as infile: + structure = json.load(infile) + + # Get protein/protein complex (PPC) + ppc = filter_rfactor(filter_resolution(filter_annotation( + get_ppc(structure)))) + to_txt(get_id(ppc), 'PPC_3-26-18PDBe') + + # Get protein structure with [compounds] + prot_compound = filter_rfactor(filter_resolution(filter_annotation( + get_with_compound(get_prot(structure))))) + + # Collect PPC UniProt + ppc_uniprot = get_uniprot(ppc) + + # Get PPC-associated proteins with compounds + ppc_ligand = get_prot_compound_ppc(prot_compound, ppc_uniprot) + to_json(ppc_ligand, FILENAME_with_ions) + to_txt(get_id(ppc_ligand), FILENAME_with_ions) + + # Get PPC-associated proteins with compounds (ions only not included) + ions = [] + with open(ion, 'r') as ion_file: + for i in ion_file: + ions.append(i.strip().upper()) + + prot_compound_without_ions = get_without_ions(prot_compound, ions) + ppc_ligand_without_ions = get_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot) + to_json(ppc_ligand_without_ions, FILENAME_without_ions) + to_txt(get_id(ppc_ligand_without_ions), FILENAME_without_ions) + + + if os.path.exists('{}.txt'.format(FILENAME_with_ions)) and \ + os.path.exists('{}.txt'.format(FILENAME_without_ions)): + LOG.info('Finished!') + else: + LOG.error('Warning! 
[Outputs] Not found!') + sys.exit(1) + + + +def get_ppc(pdbe): + ppc_unique = [] + ppc_repeated = [] + for i in xrange(len(pdbe)): + if 'assembly_composition' in pdbe[i].keys(): + assembly_composition = pdbe[i]['assembly_composition'] + if len(assembly_composition) == 1 and \ + assembly_composition == ['protein/protein complex']: + ppc_unique.append(pdbe[i]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count( + assembly_composition[0]) == len(assembly_composition): + ppc_repeated.append(pdbe[i]) + ppc_all = ppc_unique + ppc_repeated + return ppc_all + + +def get_prot(pdbe): + prot_unique = [] + prot_repeated = [] + for i in xrange(len(pdbe)): + if 'assembly_composition' in pdbe[i].keys(): + assembly_composition = pdbe[i]['assembly_composition'] + if len(assembly_composition) == 1 and \ + assembly_composition == ['protein structure']: + prot_unique.append(pdbe[i]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein structure') >= 1 and \ + assembly_composition.count( + assembly_composition[0]) == len(assembly_composition): + prot_repeated.append(pdbe[i]) + prot_all = prot_unique + prot_repeated + return prot_all + + +def filter_annotation(pdbe): + annotation = [] + for i in xrange(len(pdbe)): + if len(pdbe[i]['experimental_method']) == 1: + annotation.append(pdbe[i]) + return annotation + +def filter_resolution(pdbe): + quality = [] + for i in xrange(len(pdbe)): + if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: + if 'resolution' in pdbe[i].keys() and \ + float(pdbe[i]['resolution']) <= RESOLUTION: + quality.append(pdbe[i]) + return quality + +def filter_rfactor(pdbe): + quality = [] + for i in xrange(len(pdbe)): + if len([e for e in pdbe[i]['experimental_method'] if e in XRAY]) == 1: + if 'r_free' in pdbe[i].keys() and 'r_factor' in pdbe[i].keys(): + if float(pdbe[i]['r_free'] - pdbe[i]['r_factor']) <= abs(DIFFR): + quality.append(pdbe[i]) + return quality + + +def get_with_compound(pdbe): + compound = [] + for i in xrange(len(pdbe)): + if 'compound_id' in pdbe[i].keys(): + compound.append(pdbe[i]) + return compound + + +def get_uniprot(pdbe): + uniprot = [] + for i in xrange(len(pdbe)): + if 'uniprot_id' in pdbe[i].keys(): + uniprot.extend(pdbe[i]['uniprot_id']) + return uniprot + + +def get_prot_compound_ppc(protein, uniprot): + prot_compound_ppc = [] + for i in xrange(len(protein)): + if 'uniprot_id' in protein[i].keys(): + if len([e for e in protein[i]['uniprot_id'] if e in uniprot]) >= 1: + prot_compound_ppc.append(protein[i]) + return prot_compound_ppc + + +def get_without_ions(protein, ions): + without_ions = [] + with_ions_not_only = [] + for i in xrange(len(protein)): + if 'compound_id' in protein[i].keys(): + if len([e for e in protein[i]['compound_id'] if e in ions]) == 0: + without_ions.append(protein[i]) + elif len([e for e in protein[i]['compound_id'] if e in ions]) >= 1: + if len([e for e in protein[i]['compound_id'] if e in ions]) \ + != len(protein[i]['compound_id']): + with_ions_not_only.append(protein[i]) + print len(without_ions), len(with_ions_not_only) + compound_ions = without_ions + with_ions_not_only + return compound_ions + + +def get_id(pdbe): + pdb_id = [] + for i in xrange(len(pdbe)): + pdb_id.append(pdbe[i]['pdb_id']) + return pdb_id + + +def to_json(data, outfile): + with open('{}.json'.format(outfile), 'wb') as jsonfile: + json.dump(data, jsonfile) + + +def to_txt(data, outfile): + with open('{}.txt'.format(outfile), 'wb') as 
txtfile: + for pdb_id in data: + txtfile.write(pdb_id + '\n') + + + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ + %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + +# ============================================================================= + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = 'Get protein/protein \ + complex PDB IDs from PDBe') + parser.add_argument('pdbe', help = 'Input [.json file]: \ + PBDe (with assembly and structure quality information included') + parser.add_argument('ion', help = 'Input [.txt file]: Compounds considered \ + as ions') + options = parser.parse_args() + setlogger() + main(options.pdbe, options.ion) diff --git a/pyScripts/get_ppc_ligand_compound_from_pdbe.py b/pyScripts/get_ppc_ligand_compound_from_pdbe.py new file mode 100644 index 0000000000000000000000000000000000000000..a91caa2d8a00c3559d5e14213294b8308dff1b43 --- /dev/null +++ b/pyScripts/get_ppc_ligand_compound_from_pdbe.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: February 2018 +Version: 2 + + + +IDENTIFY INTERACTING LIGANDS AND COMPOUNDS OF PROTEIN-PROTEIN COMPLEXES FROM PDBe + + +Usage: [Script.py] [PDBe] +------------------------------------------------------------------ +Argument: + [PDBe]: + Input (json format) + PDBe (with assembly information included) + +Return: + [list(ligands)]: + Output (txt format) + Interacting ligands of protein/protein complexes + + [list(compounds)]: + Output (txt format) + Compounds of protein/protein complexes + + +Warning: +------- +PDBs with unique and repeated assembly composition annotation are both included! 
+[unique annotation: 'protein/protein complex'] +[repeated annotation: 'protein/protein complex, protein/protein complex'] +""" + +# ============================================================================= + +import argparse +import logging +import json +import pandas as pd + +# ============================================================================= + +LOG = logging.getLogger('Get ligands and compounds of protein/protein complexes from PDBe') + +# ============================================================================= + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + + + +def main(pdbe): + structure = pd.read_json(pdbe, orient = 'columns') + ligand_to_txt(get_ligand(get_ppc_data(structure))) + compound_to_txt(get_compound(get_ppc_data(structure))) + +def get_ppc_data(pdbe): + ppc_unique_data = [] + ppc_repeated_data = [] + for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): + assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] + if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: + ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count(assembly_composition[0]) == len(assembly_composition): + ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + ppc_all_data = ppc_unique_data + ppc_repeated_data + return ppc_all_data + + + +def get_ligand(ppc): + ppc_ligand = [] + unique_ligand = [] + for i in xrange(len(ppc)): + if 'interacting_ligands' in ppc[i].keys(): + ppc_ligand.append(ppc[i]['interacting_ligands'][0].split(' :')[0]) + + for j in ppc_ligand: + if j not in unique_ligand: + unique_ligand.append(j) + + #print len(unique_ligand) + return unique_ligand + + + +def get_compound(ppc): + ppc_compound = [] + unique_compound = [] + for i in xrange(len(ppc)): + if 'compound_id' in ppc[i].keys(): + ppc_compound.extend(ppc[i]['compound_id']) + + for j in ppc_compound: + if j not in unique_compound: + unique_compound.append(j) + + #print len(unique_compound) + return unique_compound + + +def ligand_to_txt(ligand): + with open('PPC_LIGANDS_2-20-18PDBe.txt', 'wb') as txtfile: + for lig in ligand: + txtfile.write(lig + '\n') + +def compound_to_txt(compound): + with open('PPC_COMPOUNDS_2-20-18PDBe.txt', 'wb') as txtfile: + for c in compound: + txtfile.write(c + '\n') + +# ============================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = + 'Get ligands and compounds of protein/protein complexes from PDBe') + parser.add_argument('pdbe', help = 'Input [.json file]: \ + PBDe (with interacting ligand and compound information included') + options = parser.parse_args() + setlogger() + main(options.pdbe) diff --git a/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py b/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py new file mode 100644 index 0000000000000000000000000000000000000000..59a2c0ac99f63241860924a3b5001e069f73a7ac --- /dev/null +++ b/pyScripts/get_ppc_protcomp-ion_ppc_from_pdbe.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: February 2018 
+Version: 2 + + + +IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS (with or without including ions) +INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe + + +Usage: [Script.py] [PDBe] +------------------------------------------------------------------ +Argument: + [PDBe]: + Input (json format) + PDBe (with [assembly_composition], [compound_id] and [uniprot_id] included) + +Return: + [list(protein)]: + Output (txt format) + Proteins with compounds associated with a protein/protein complex PDB IDs + + [json(protein)]: + Output (json format) + Proteins with compounds associated with a protein/protein complex dataset + + Compounds: + --------- + ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] + or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] + + + + [list(protein)]: + Output (txt format) + Proteins with compounds associated with a protein/protein complex PDB IDs + IONS ONLY EXCLUDED + + [json(protein)]: + Output (json format) + Proteins with compounds associated with a protein/protein complex dataset + IONS ONLY EXCLUDED + + Compounds: + --------- + ** EXAMPLE: [u'FE : FE (III) ION'] + + + +!Warning!: +------- +PDBs with unique and repeated assembly composition annotation are both included! +[unique annotation: 'protein/protein complex'] +[repeated annotation: 'protein/protein complex, protein/protein complex'] +""" + +# ============================================================================= + +import argparse +import logging +import json +import pandas as pd + +# ============================================================================= + +LOG = logging.getLogger('Proteins associated with a PPC - Compounds') + +# ============================================================================= + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + + +def main(pdbe): + structure = pd.read_json(pdbe, orient = 'columns') + + # Get protein/protein complex [assembly_composition] + ppc = get_ppc_data(structure) + + # Get protein structure [assembly_composition] with 'compounds' + prot_compound = get_compound_id(get_prot_data(structure)) + + # Collect PPC UniProt + ppc_uniprot = get_uniprot(ppc) + + # Identify protein structure with compounds associated to PPC + to_txt(get_id_prot_compound_ppc(prot_compound, ppc_uniprot)) + to_json(get_data_prot_compound_ppc(prot_compound, ppc_uniprot)) + + + # Get protein structure [assembly_composition] with compounds (ions only not included) + ion = get_ion(get_compound_name(prot_compound)) + prot_compound_without_ions = get_compound_without_ion(prot_compound, ion) + to_txt_ion(get_id_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot)) + to_json_ion(get_data_prot_compound_ppc(prot_compound_without_ions, ppc_uniprot)) + + + + +def get_ppc_data(pdbe): + ppc_unique_data = [] + ppc_repeated_data = [] + for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): + assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] + if len(assembly_composition) == 1 and assembly_composition == ['protein/protein complex']: + ppc_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein/protein complex') >= 1 and \ + assembly_composition.count(assembly_composition[0]) == len(assembly_composition): 
+ ppc_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + ppc_all_data = ppc_unique_data + ppc_repeated_data + return ppc_all_data + + + +def get_prot_data(pdbe): + prot_unique_data = [] + prot_repeated_data = [] + for i in xrange(len(pdbe['grouped']['pdb_id']['groups'])): + assembly_composition = pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]['assembly_composition'] + if len(assembly_composition) == 1 and assembly_composition == ['protein structure']: + prot_unique_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + elif len(assembly_composition) > 1: + if assembly_composition.count('protein structure') >= 1 and \ + assembly_composition.count(assembly_composition[0]) == len(assembly_composition): + prot_repeated_data.append(pdbe['grouped']['pdb_id']['groups'][i]['doclist']['docs'][0]) + prot_all_data = prot_unique_data + prot_repeated_data + return prot_all_data + + + +def get_compound_id(data): + ''' + Get 3D structure with compound [compound_id] + -------------------------------------------------- + Argument: + [json]: protein structures (dataset) + + Return: + [json]: protein structures with compounds (dataset) + ''' + compound = [] + for i in xrange(len(data)): + if 'compound_id' in data[i].keys(): + compound.append(data[i]) + return compound + + + +def get_uniprot(data): + ''' + Get UniProt + ------------------------------------------------- + Argument: + [json]: protein/protein complexes (dataset) + + Return: + [list]: protein/protein complexes Uniprot IDs + ''' + uniprot = [] + for i in xrange(len(data)): + if 'uniprot_id' in data[i].keys(): + uniprot.extend(data[i]['uniprot_id']) + return uniprot + + +def get_id_prot_compound_ppc(data, uniprot): + ''' + Get PDB IDs only of protein structure with compounds associated with a protein/protein complex + ----------------------------------------------------------------------------------------------- + Arguments: + [json]: protein structures with compounds (dataset) + [list]: protein/protein complex uniprot id + + Return: + [list]: protein structure with compounds associated with a protein/protein complex PDB IDs + ''' + prot_compound_pcc = [] + for i in xrange(len(data)): + if 'uniprot_id' in data[i].keys(): + if len([e for e in data[i]['uniprot_id'] if e in uniprot]) >= 1: + prot_compound_pcc.append(data[i]['pdb_id']) + return prot_compound_pcc + + + +def get_data_prot_compound_ppc(data, uniprot): + ''' + Get data only of protein structure with compounds associated with a protein/protein complex + ----------------------------------------------------------------------------------------------- + Arguments: + [json]: protein structures with compounds (dataset) + [list]: protein/protein complex uniprot id + + Return: + [json]: protein structure with compounds associated with a protein/protein complex (dataset) + ''' + prot_compound_pcc_data = [] + for i in xrange(len(data)): + if 'uniprot_id' in data[i].keys(): + if len([e for e in data[i]['uniprot_id'] if e in uniprot]) >= 1: + prot_compound_pcc_data.append(data[i]) + return prot_compound_pcc_data + + + + +def get_compound_name(data): + ''' + Get protein compound names + -------------------------------------------------- + Argument: + [json]: protein structures with compounds (dataset) + + Return: + [list]: compound names + ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] + ''' + compound = [] + for i in xrange(len(data)): + if 'compound_name' in data[i].keys(): + 
compound.extend(data[i]['compound_name']) + return compound + + + +def get_ion(compound_name): + ''' + Get protein ions + -------------------------------------------------- + Argument: + [list]: compound names + ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] + + Return: + [list]: ions + ** EXAMPLE: ['FE'] + ''' + ion = [] + ion_unique = [] + for i in xrange(len(compound_name)): + if 'ION' in compound_name[i].split(' :')[1].split(' '): + ion.append(compound_name[i].split(' :')[0]) + for j in ion: + if j not in ion_unique: + ion_unique.append(j) + print ion_unique + return ion_unique + + + +def get_compound_without_ion(data, ions): + ''' + Get 3D structure with compound [compound_id], ion excluded + ---------------------------------------------------------- + Arguments: + [json]: protein structures (dataset) + [list]: ions + + Return: + [json]: protein structures with compounds != ions (dataset) + + !Warning! + --------- + Protein structures either with only compounds, or compounds and ions are included + ** EXAMPLE: [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID'] + or [u'FHB : 3-FLUORO-4-HYDROXYBENZOIC ACID', u'FE : FE (III) ION'] + + Only protein structures with only ions are excluded + ** EXAMPLE: [u'FE : FE (III) ION'] + ''' + compound_without_ion = [] + compound_with_ion = [] + for i in xrange(len(data)): + if 'compound_id' in data[i].keys(): + if len([e for e in data[i]['compound_id'] if e in ions]) == 0: + compound_without_ion.append(data[i]) + elif len([e for e in data[i]['compound_id'] if e in ions]) >= 1: + if len([e for e in data[i]['compound_id'] if e in ions]) != len(data[i]['compound_id']): + compound_with_ion.append(data[i]) + compound_ion = compound_without_ion + compound_with_ion + return compound_ion + + + +def to_txt(index): + with open('PROT-COMPOUNDS_PPC_2-20-18PDBe.txt', 'wb') as txtfile: + for i in index: + txtfile.write(i + '\n') + +def to_json(index): + with open('PROT-COMPOUNDS_PPC_2-20-18PDBe.json', 'wb') as jsonfile: + json.dump(index, jsonfile) + +def to_txt_ion(index): + with open('PROT-COMPOUNDS-noIONS_PPC_2-20-18PDBe.txt', 'wb') as txtfile: + for i in index: + txtfile.write(i + '\n') + +def to_json_ion(index): + with open('PROT-COMPOUNDS-noIONS_PPC_2-20-18PDBe.json', 'wb') as jsonfile: + json.dump(index, jsonfile) + +# ============================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = + 'IDENTIFY PROTEIN STRUCTURE WITH COMPOUNDS INVOLVED IN PROTEIN-PROTEIN INTERACTION FROM PDBe') + parser.add_argument('pdbe', help = 'Input [.json file]: \ + PBDe (with [assembly_composition], [compound_id] and [uniprot_id] included)') + options = parser.parse_args() + setlogger() + main(options.pdbe) diff --git a/pyScripts/iPPI-DB_descriptors_V2.py b/pyScripts/iPPI-DB_descriptors_V2.py new file mode 100644 index 0000000000000000000000000000000000000000..ef516ce6ee41b26aecaada0d83730c0d57572e08 --- /dev/null +++ b/pyScripts/iPPI-DB_descriptors_V2.py @@ -0,0 +1,320 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: January 2018 +Version: 2 + + + +CALCULATE 2D/3D POCKET DESCRIPTORS FROM VOLSITE + + +Usage: [Script.py] [PDB_descriptor.txt] [PDB_2D-3Ddescriptors.csv] +------------------------------------------------------------------ + +[PDB_descriptor.txt]: + Input (txt format) + Pocket descriptors calculated by VolSite + +[PDB_2D-3Ddescriptors.csv]: + Output (csv format) + Pocket descriptors including: + * 89 descriptors from VolSite + * 10 
descriptors using a linear combination of VolSite descriptors + (Kuenemann, 2016 Scientific Reports) + * 10 geometric descriptors from RDKit toolkit + (http://www.rdkit.org/docs/Overview.html, version 2017.09.1) +""" + +# ============================================================================= + +import os +import sys +import argparse +import logging +import csv +import pandas as pd +from rdkit import Chem +from rdkit.Chem import Descriptors3D + +# ============================================================================= + +LOG = logging.getLogger("CALCULATE 2D/3D POCKET DESCRIPTORS FROM VOLSITE") + + +VOLSITE_HEADER = [ + 'Volume', 'CZ', 'CA', 'O', 'OD1', 'OG', 'N', 'NZ', 'DU', + 'CZ40', 'CZ40-50', 'CZ50-60', 'CZ60-70', 'CZ70-80', 'CZ80-90', 'CZ90-100', 'CZ100-110', 'CZ110-120', 'CZ120', + 'CA40', 'CA40-50', 'CA50-60', 'CA60-70', 'CA70-80', 'CA80-90', 'CA90-100', 'CA100-110', 'CA110-120', 'CA120', + 'O40', 'O40-50', 'O50-60', 'O60-70', 'O70-80', 'O80-90', 'O90-100', 'O100-110', 'O110-120', 'O120', + 'OD140', 'OD140-50', 'OD150-60', 'OD160-70', 'OD170-80', 'OD180-90', 'OD190-100', 'OD1100-110', 'OD1110-120', + 'OD1120', + 'OG40', 'OG40-50', 'OG50-60', 'OG60-70', 'OG70-80', 'OG80-90', 'OG90-100', 'OG100-110', 'OG110-120', 'OG120', + 'N40', 'N40-50', 'N50-60', 'N60-70', 'N70-80', 'N80-90', 'N90-100', 'N100-110', 'N110-120', 'N120', + 'NZ40', 'NZ40-50', 'NZ50-60', 'NZ60-70', 'NZ70-80', 'NZ80-90', 'NZ90-100', 'NZ100-110', 'NZ110-120', 'NZ120', + 'DU40', 'DU40-50', 'DU50-60', 'DU60-70', 'DU70-80', 'DU80-90', 'DU90-100', 'DU100-110', 'DU110-120', 'DU120', + 'POCKET' +] + +DESC_COMBINATION = { + 'T40': ['CZ40', 'CA40', 'O40', 'OD140', 'OG40', 'N40', 'NZ40', 'DU40'], + 'T40-50': ['CZ40-50', 'CA40-50', 'O40-50', 'OD140-50', 'OG40-50', 'N40-50', 'NZ40-50', 'DU40-50'], + 'T50-60': ['CZ50-60', 'CA50-60', 'O50-60', 'OD150-60', 'OG50-60', 'N50-60', 'NZ50-60', 'DU50-60'], + 'T60-70': ['CZ60-70', 'CA60-70', 'O60-70', 'OD160-70', 'OG60-70', 'N60-70', 'NZ60-70', 'DU60-70'], + 'T70-80': ['CZ70-80', 'CA70-80', 'O70-80', 'OD170-80', 'OG70-80', 'N70-80', 'NZ70-80', 'DU70-80'], + 'T80-90': ['CZ80-90', 'CA80-90', 'O80-90', 'OD180-90', 'OG80-90', 'N80-90', 'NZ80-90', 'DU80-90'], + 'T90-100': ['CZ90-100', 'CA90-100', 'O90-100', 'OD190-100', 'OG90-100', 'N90-100', 'NZ90-100', 'DU90-100'], + 'T100-110': ['CZ100-110', 'CA100-110', 'O100-110', 'OD1100-110', 'OG100-110', 'N100-110', 'NZ100-110', 'DU100-110'], + 'T110-120': ['CZ110-120', 'CA110-120', 'O110-120', 'OD1110-120', 'OG110-120', 'N110-120', 'NZ110-120', 'DU110-120'], +} + +DESC_GEOM = [ + 'PMI1', 'PMI2', 'PMI3', 'NPR1', 'NPR2', 'Rgyr', + 'Asphericity', 'SpherocityIndex', 'Eccentricity', 'InertialShapeFactor' +] + +# ============================================================================= + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ + %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + + +def main(volsitedescriptor, pocketdescriptor): + volsite_csv = descriptor_volsite(volsitedescriptor) + combination_csv = descriptor_combination(volsite_csv) + volsite_combination_csv = "{}volsite+combinaison.csv".format( + volsite_csv.rsplit("volsite.csv")[0]) + merge_csv(volsite_csv, combination_csv, volsite_combination_csv) + geometric_csv = descriptor_geometric(volsitedescriptor) + merge_csv(volsite_combination_csv, geometric_csv, pocketdescriptor) + + if os.path.exists(pocketdescriptor): + 
os.remove(volsite_csv) + os.remove(combination_csv) + os.remove(volsite_combination_csv) + os.remove(geometric_csv) + LOG.info('Finished!') + else: + LOG.error('Error: 2D/3D Pocket Descriptor file not found') + sys.exit(1) + + +def descriptor_volsite(txtfile): + """ + Convert VolSite output file from txt format to csv format + (descriptor headers added) + --------------------------------------------------------- + Argument: + [file]: VolSite descriptors (txt format) + + Return: + [file]: VolSite descriptors (csv format) + with descriptor headers + """ + try: + with open(txtfile, 'rb') as infile: + volsite_csv = "{}_volsite.csv".format(os.path.splitext(txtfile)[0]) + with open(volsite_csv, 'wb') as outfile: + in_text = csv.reader(infile, delimiter=' ') + out_csv = csv.writer(outfile) + out_csv.writerow(VOLSITE_HEADER) + for line in in_text: + out_csv.writerow(line) + except IOError, e: + LOG.error('{}'.format(e)) + sys.exit(1) + return volsite_csv + + +def descriptor_combination(volsite_descriptor_csv): + """ + Calculate and write linear combinations of VolSite descriptors to csv file + (Kuenemann, 2016 Scientific Reports) + --------------------------------------------------------------------------- + Argument: + [file]: VolSite descriptors (csv format) + + Return: + [file]: linear combinations of VolSite descriptors (csv format) + """ + with open(volsite_descriptor_csv, 'rb') as infile: + volsite = csv.reader(infile, delimiter = ',') + descriptors = volsite.next() + + # Create a dictionnary + # key(descriptor): values(list of all pocket values) + data = {} + # Count the number of pockets + numbpocket = 0 + for header in descriptors: + data[header] = [] + for pocket in volsite: + numbpocket += + 1 + for header, value in zip(descriptors, pocket): + data[header].append(value) + combination_csv = "{}combination.csv".format( + volsite_descriptor_csv.rsplit("volsite.csv")[0]) + with open(combination_csv, 'wb') as outfile: + combivolsite = csv.writer(outfile, delimiter = ',', + lineterminator = '\n') + # Descriptor headers and values have to be in variables + # to write them in a new file column by column (sep = comma) + descname = [] + cdata = [] + for dch in DESC_COMBINATION.keys(): + # dch = descriptor combination header (column) + bsum = [] + for p in range(numbpocket): + # p = pocket (row) + buriedness_values = [] + for colname in DESC_COMBINATION[dch]: + # Get buriedness value by descriptors + buriedness_values.append(float(data[colname][p])) + # Get sum of buriedness value by descriptor combinations + bsum.append(sum(buriedness_values)) + descname.append(dch) + cdata.append(bsum) + combivolsite.writerow(descname) + datatransposed = zip(*cdata) + for row in datatransposed: + combivolsite.writerow(row) + return combination_csv + + +def descriptor_geometric(txtfile): + """ + Write geometric descriptor from the negative image of binding pockets + detected by VolSite to csv file + --------------------------------------------------------------------------- + Argument: + [file]: VolSite descriptors (txt format) + + Return: + [file]: shape descriptors (csv format) + """ + with open(txtfile, 'rb') as infile: + pocketvolsite = csv.reader(infile, delimiter = ',') + # Get the number of pockets + numbpocket = 0 + for pocket in pocketvolsite: + numbpocket += + 1 + geometric_csv = "{}geometric.csv".format(os.path.splitext(txtfile)[0]) + with open(geometric_csv, 'wb') as outfile: + shape = csv.writer(outfile, delimiter = ',', lineterminator = '\n') + shape.writerow(DESC_GEOM) + # Calculate geometric 
descriptors pocket by pockets + for cavity in range(1, numbpocket + 1): + pocket_mol2 = "{}_CAVITY_N{}_ALL.mol2".format( + geometric_csv.rsplit("_")[0], cavity) + replace_mol2atom(pocket_mol2) + shape.writerow(calculate_shape_descriptor( + "temp_{}_CAVITY_N{}_ALL.mol2".format( + geometric_csv.rsplit("_")[0], cavity))) + os.remove("temp_{}_CAVITY_N{}_ALL.mol2".format( + geometric_csv.rsplit("_")[0], cavity)) + return geometric_csv + + +def replace_mol2atom(cavity): + """ + Replace the atom name assigned by VolSite to the probes by 'C' (carbone) + + Warning 1 + Fixing the RDKit error: 'atom' with a degree > 1 + + Warning 2 + RDKit needs the output file [PDB_CAVITY_Nx_ALL_temp.mol2] + NOT the link + ------------------------------------------------------------------ + Argument: + [file]: negative image of a binding pocket (mol2 format) + + Return: + [file]: negative image of a binding pocket with carbone as atom name + (mol2 format, temporary file used as input by RDKit) + """ + header = True + parsing = False + with open(cavity, 'rb') as infile: + with open("temp_{}.mol2".format(cavity.rsplit('.')[0]), 'wb') as tempf: + for line in infile: + if line.startswith('@<TRIPOS>ATOM'): + tempf.write(line)# + '\n') + header = False + parsing = True + continue + elif line.startswith('@<TRIPOS>BOND'): + parsing = False + + if header: + tempf.write(line) + if parsing: + tempf.write(line.replace(line.split()[5], 'C')) + #return tempf + + +def calculate_shape_descriptor(cavity): + """ + Calculate geometric descriptors with RDKit + ------------------------------------------ + Geometric descriptor details: + http://www.rdkit.org/Python_Docs/rdkit.Chem.Descriptors3D-module.html + + Argument: + [file]: negative image of a binding pocket (mol2 format) + + Return: + [list]: geometric descriptors + """ + mol2file = Chem.MolFromMol2File(cavity, sanitize = False, removeHs = False) + geom_val = [] + geom_val.append(Descriptors3D.PMI1(mol2file)) + geom_val.append(Descriptors3D.PMI2(mol2file)) + geom_val.append(Descriptors3D.PMI3(mol2file)) + geom_val.append(Descriptors3D.NPR1(mol2file)) + geom_val.append(Descriptors3D.NPR2(mol2file)) + geom_val.append(Descriptors3D.RadiusOfGyration(mol2file)) + geom_val.append(Descriptors3D.Asphericity(mol2file)) + geom_val.append(Descriptors3D.SpherocityIndex(mol2file)) + geom_val.append(Descriptors3D.Eccentricity(mol2file)) + geom_val.append(Descriptors3D.InertialShapeFactor(mol2file)) + return geom_val + + +def merge_csv(csv_infile1, csv_infile2, csv_outfile): + """ + Concatenate two csv files + ------------------------- + Arguments: + [file]: input (csv format) + [file]: input (csv format) + [file]: output filename (csv format) + """ + csv1 = pd.read_csv(csv_infile1) + csv2 = pd.read_csv(csv_infile2) + csv3 = pd.concat([csv1, csv2], axis = 1) + return csv3.to_csv(csv_outfile, sep = ',', index = False) + +# ============================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = + 'Calculate 2D/3D Pocket Descriptors from VolSite') + parser.add_argument('volsitedescriptor', help = 'Input [.txt file]: \ + Pocket descriptors calculated by VolSite') + parser.add_argument('pocketdescriptor', help = 'Output [.csv filename]: \ + Pocket descriptors including \ + 89 descriptors from VolSite, \ + 10 descriptors using a combination of VolSite descriptors \ + and 10 geometric descriptors') + + options = parser.parse_args() + setlogger() + main(options.volsitedescriptor, options.pocketdescriptor) diff --git 
a/pyScripts/ion.pdf b/pyScripts/ion.pdf new file mode 100644 index 0000000000000000000000000000000000000000..46ecfa3582acda4ea7252068a70a9f9a1259d6f8 Binary files /dev/null and b/pyScripts/ion.pdf differ diff --git a/pyScripts/ion.txt b/pyScripts/ion.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2ec67982891e76539ee9654375ae4c8acb73de8 --- /dev/null +++ b/pyScripts/ion.txt @@ -0,0 +1,63 @@ + +3NI +4MO +4PU +4TI +6MO +AG +AL +AU +AU3 +AUC +BA +BR +BS3 +CA +CD +CE +CF +CL +CO +CR +CS +CU +CU1 +ER3 +EU +EU3 +F +FE +FE2 +GA +HG +IOD +IR +IR3 +K +LA +LI +LU +MG +MN +MN3 +MOO +NA +NI +PD +PR +PT +PT4 +RB +RU +SB +SE4 +SM +SR +TB +TH +THE +TL +W +ZN +ZR + diff --git a/pyScripts/ppiinterface_dist.py b/pyScripts/ppiinterface_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..e79e69ac85a5c9070b319f5dfa89d29c2570756f --- /dev/null +++ b/pyScripts/ppiinterface_dist.py @@ -0,0 +1,335 @@ +# -*- coding: utf-8 -*- + +""" +Author: Alexandra Moine-Franel +Date: February 2018 +Version: 4 + + +IDENTIFY PROTEIN INTERFACE / POCKET RESIDUES + +Determine protein residues at a specific distance either from protein partner, +ligand (small molecule) or negative image of the pocket generated by VolSite. + + + +Usage: [Script.py] [TARGET] [PARTNER] -d (DISTANCE) +------------------------------------------------------------------------------- +[TARGET]: + Input (.mol2 or .pdb format) + Protein CHAIN + +[PARTNER]: + Input (.mol2 or .pdb format) + Negative image of the pocket generated by VolSite (.mol2 format) + OR Ligand (.pdb format) + OR Protein (.pdb format) + +(DISTANCE)- optionnal + Input (float or integer) + Distance threshold between the target and the partner + * Distance target-protein/ligand = 5 angstroms - by defaut + * Distance target-cavity = 3.5 angstroms - by default + +------------------ + +[POCKET_RESIDUES]: + Output (.txt format) + 'PDB-targetchain-partnerchain_distance.txt' (target-protein/ligand) + OR 'PDB-targetchain-CAVITY_Nx_ALL_distance.txt' (target-cavity) +""" + +# ============================================================================= + +import csv +import os +import sys +import argparse +import logging +#import time +from biopandas.mol2 import PandasMol2 +from biopandas.pdb import PandasPdb + +# ============================================================================= + +LOG = logging.getLogger("IDENTIFY PROTEIN INTERFACE / POCKET RESIDUES") + +# ============================================================================= + +def main(target, partner, distance): + #start = time.time() + if target and partner: + if os.path.splitext(target)[-1] == '.mol2' and \ + os.path.splitext(partner)[-1] == '.mol2': + mol2target = parse_mol2(target) + mol2partner = parse_mol2(partner) + mol2_respocket(mol2target, mol2partner, distance) + if os.path.exists('{}-{}_{}.txt'.format( \ + mol2target.code, mol2partner.code.rsplit('.mol2')[0], \ + distance)): + LOG.info('Finished!') + #end = time.time() + #print(end - start) + else: + LOG.error('Warning! 
[Output] Not found!') + sys.exit(1) + elif os.path.splitext(target)[-1] == '.pdb' and \ + os.path.splitext(partner)[-1] == '.pdb': + pdbtarget = parse_pdb(target) + pdbpartner = parse_pdb(partner) + if pdbpartner.df['ATOM'].empty == True: + pdb_resinterfacePL(pdbtarget, pdbpartner, distance) + if os.path.exists('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ + pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['HETATM']['residue_name'][0], \ + distance)): + LOG.info('Finished!') + #end = time.time() + #print(end - start) + else: + LOG.error('Warning! [Output] Not found!') + sys.exit(1) + else: + pdb_resinterfacePP(pdbtarget, pdbpartner, distance) + if os.path.exists('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ + pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['ATOM']['chain_id'][0], \ + distance)): + LOG.info('Finished!') + #end = time.time() + #print(end - start) + else: + LOG.error('Warning! [Output] Not found!') + sys.exit(1) + else: + LOG.error('Error! [TARGET] or [PARTNER] file not in .mol2 or .pdb format') + sys.exit(1) + elif target and not partner: + parser.error('Error: [PARTNER: protein, ligand or cavity] is required') + + + + +def parse_mol2(mol2file): + """ + Parse .mol2 file + --------------------------------------------------------------------------- + Argument: [file]: .mol2 format + Return: [dataframe] + """ + try: + pmol = PandasMol2().read_mol2(mol2file) + except(IOError), e: + LOG.error('{}'.format(e)) + sys.exit(1) + else: + return pmol + + +def parse_pdb(pdbfile): + """ + Parse .pdb file + --------------------------------------------------------------------------- + Argument: [file]: .pdb format + Return: [dataframe] + """ + try: + ppdb = PandasPdb().read_pdb(pdbfile) + except(IOError), e: + LOG.error('{}'.format(e)) + sys.exit(1) + else: + return ppdb + + +def mol2_respocket(mol2target, mol2partner, distance): + """ + Identify protein target residues at a specified distance from its partner + (i.e. the cavity negative image generated by VolSite). + --------------------------------------------------------------------------- + Arguments: + [file]: protein target (.mol2 format) + [file]: negative image of the binding pocket (.mol2 format) + [float]: distance threshold (by default, 3.5 angstroms) + Return: + [file]: pocket residues (.txt format) + """ + ppires = [] + with open('{}-{}_{}.txt'.format(mol2target.code, \ + mol2partner.code.rsplit('.mol2')[0], \ + distance), 'wb') as outfile: + for probe in xrange(len(mol2partner.df.atom_id)): + for atom in xrange(len(mol2target.df.atom_id)): + if mol2target.df.subst_name[atom] not in ppires: + xp = mol2target.df['x'][atom] + xc = mol2partner.df['x'][probe] + distX = x_dist(xp, xc) + if distX < float(distance): + yp = mol2target.df['y'][atom] + yc = mol2partner.df['y'][probe] + distXY = xy_dist(xp, yp, xc, yc) + if distXY < float(distance): + zp = mol2target.df['z'][atom] + zc = mol2partner.df['z'][probe] + distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) + if distXYZ < float(distance): + ppires.append(mol2target.df.subst_name[atom]) + for resp in ppires: + outfile.write(resp + '\n') + + +def pdb_resinterfacePP(pdbtarget, pdbpartner, distance): + """ + Identify protein target residues at a specified distance from its partner + (i.e. 
protein) + --------------------------------------------------------------------------- + Arguments: + [file]: protein target (.pdb format) + [file]: protein partner (.pdb format) + [float]: distance threshold (by default, 5 angstroms) + Return: + [file]: interface residues (.txt format) + """ + ppires = [] + with open('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ + pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['ATOM']['chain_id'][0], \ + distance), 'wb') as outfile: + for atomt in xrange(len(pdbpartner.df['ATOM']['atom_number'])): + for atomp in xrange(len(pdbtarget.df['ATOM']['atom_number'])): + if ''.join(map(str, (pdbtarget.df['ATOM']['residue_name'][atomp], \ + pdbtarget.df['ATOM']['residue_number'][atomp]))) not in ppires: + xp = pdbtarget.df['ATOM']['x_coord'][atomp] + xc = pdbpartner.df['ATOM']['x_coord'][atomt] + distX = x_dist(xp, xc) + if distX < float(distance): + yp = pdbtarget.df['ATOM']['y_coord'][atomp] + yc = pdbpartner.df['ATOM']['y_coord'][atomt] + distXY = xy_dist(xp, yp, xc, yc) + if distXY < float(distance): + zp = pdbtarget.df['ATOM']['z_coord'][atomp] + zc = pdbpartner.df['ATOM']['z_coord'][atomt] + distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) + if distXYZ < float(distance): + ppires.append(''.join(map(str, \ + (pdbtarget.df['ATOM']['residue_name'][atomp], \ + pdbtarget.df['ATOM']['residue_number'][atomp])))) + for resp in ppires: + outfile.write(resp + '\n') + + +def pdb_resinterfacePL(pdbtarget, pdbpartner, distance): + """ + Identify protein target residues at a specified distance from its partner + (i.e. ligand) + --------------------------------------------------------------------------- + Arguments: + [file]: protein target (.pdb format) + [file]: ligand (.pdb format) + [float]: distance threshold (by default, 5 angstroms) + Return: + [file]: pocket residues (.txt format) + """ + ppires = [] + with open('{}-{}-{}_{}.txt'.format(pdbtarget.code, \ + pdbtarget.df['ATOM']['chain_id'][0], pdbpartner.df['HETATM']['residue_name'][0], \ + distance), 'wb') as outfile: + for atomt in xrange(len(pdbpartner.df['HETATM']['atom_number'])): + for atomp in xrange(len(pdbtarget.df['ATOM']['atom_number'])): + if ''.join(map(str, (pdbtarget.df['ATOM']['residue_name'][atomp], \ + pdbtarget.df['ATOM']['residue_number'][atomp]))) not in ppires: + xp = pdbtarget.df['ATOM']['x_coord'][atomp] + xc = pdbpartner.df['HETATM']['x_coord'][atomt] + distX = x_dist(xp, xc) + if distX < float(distance): + yp = pdbtarget.df['ATOM']['y_coord'][atomp] + yc = pdbpartner.df['HETATM']['y_coord'][atomt] + + distXY = xy_dist(xp, yp, xc, yc) + if distXY < float(distance): + zp = pdbtarget.df['ATOM']['z_coord'][atomp] + zc = pdbpartner.df['HETATM']['z_coord'][atomt] + distXYZ = euclidian_dist(xp, yp, zp, xc, yc, zc) + if distXYZ < float(distance): + ppires.append(''.join(map(str, \ + (pdbtarget.df['ATOM']['residue_name'][atomp], \ + pdbtarget.df['ATOM']['residue_number'][atomp])))) + for resp in ppires: + outfile.write(resp + '\n') + + +def x_dist(x1, x2): + """ + Calculate the distance x between two 1D points + ---------------------------------------------------------------------- + Arguments: + [float]: coordinate x of target atom + coordinate x of partnet atom + Return: + [float]: distance + """ + dist = ((x1-x2)**2) ** 0.5 + return dist + + +def xy_dist(x1, y1, x2, y2): + """ + Calculate the distance xy between two 2D points + ---------------------------------------------------------------------- + Arguments: + [float]: coordinates xy of target atom + coordinates xy of partnet atom + Return: + 
[float]: distance + """ + dist = ((x1-x2)**2 + (y1-y2)**2) ** 0.5 + return dist + + +def euclidian_dist(x1, y1, z1, x2, y2, z2): + """ + Calculate the euclidian distance xyz between two 3D points + ---------------------------------------------------------------------- + Arguments: + [float]: coordinates xyz of target atom + coordinates xyz of partnet atom + Return: + [float]: distance + """ + dist = ((x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2) ** 0.5 + return dist + + + +def setlogger(): + LOG.setLevel(logging.INFO) + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(funcName)s - \ + %(levelname)s - %(message)s') + ch.setFormatter(formatter) + LOG.addHandler(ch) + + +def default_distance(options): + if os.path.splitext(options.partner)[-1] == '.mol2': + options.distance = 3.5 + elif os.path.splitext(options.partner)[-1] == '.pdb': + options.distance = 5.0 + return options.distance + +# ============================================================================== + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description = 'Identify interface/pocket residues') + parser.add_argument('target', \ + help = 'Input [.mol2 or .pdb file]: protein target') + parser.add_argument('partner', \ + help = 'Input [.pdb file]: protein partner or ligand; OR [.mol2 file]: cavity') + parser.add_argument('-d', dest = 'distance', type = float, \ + help = 'Input [float or integer]: distance threshold \ + {by default, target-protein/ligand = 5A; target-cavity = 3.5A}') + options = parser.parse_args() + + if options.distance is None: + default_distance(options) + + setlogger() + main(options.target, options.partner, options.distance)
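Editor's note on ppiinterface_dist.py: the staged x / xy / xyz distance checks inside three nested Python loops can also be expressed as a single vectorised pairwise-distance computation. The sketch below is an illustration under stated assumptions, not part of the committed scripts: it assumes numpy and scipy are available in addition to biopandas, reuses the column names the script already relies on ('x_coord', 'y_coord', 'z_coord', 'residue_name', 'residue_number'), and the file names in the usage comment are hypothetical. It returns the same residue labels (residue name concatenated with residue number) as pdb_resinterfacePP, sorted rather than in order of discovery.

# Hedged sketch, not part of the diff: vectorised protein-protein interface search.
import numpy as np
from scipy.spatial.distance import cdist
from biopandas.pdb import PandasPdb

def interface_residues(target_pdb, partner_pdb, distance=5.0):
    """Return residue labels (e.g. 'ARG123') of the target chain that have
    at least one atom within `distance` angstroms of the partner chain."""
    target = PandasPdb().read_pdb(target_pdb).df['ATOM']
    partner = PandasPdb().read_pdb(partner_pdb).df['ATOM']
    t_xyz = target[['x_coord', 'y_coord', 'z_coord']].to_numpy()
    p_xyz = partner[['x_coord', 'y_coord', 'z_coord']].to_numpy()
    # Boolean mask over target atoms: any partner atom closer than the cut-off.
    close = (cdist(t_xyz, p_xyz) < distance).any(axis=1)
    labels = (target['residue_name'] + target['residue_number'].astype(str))[close]
    return sorted(set(labels))

# Example usage (hypothetical file names):
# print(interface_residues('1abc_A.pdb', '1abc_B.pdb', distance=5.0))

The design trade-off is memory for speed: cdist materialises the full target-by-partner distance matrix, which is fine for single chains but could be chunked for very large assemblies.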