Commit 5fc82e9e authored by Thomas Bigot

Init
#!/usr/bin/env python3
import argparse
import fastaHmmr

parser = argparse.ArgumentParser()
parser.add_argument("fastaFile", help="original set of sequences")
parser.add_argument("--fractionID", help="fraction identity", type=float, default=1.0)
parser.add_argument("--fractionCov", help="fraction coverage", type=float, default=None)
parser.add_argument("--cores", help="number of cores to use", type=int, default=1)
args = parser.parse_args()

fastaHmmr.collapseSequence(args.fastaFile, args.fractionID, args.fractionCov, args.cores)
[This diff is collapsed; the file's contents are not shown.]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import argparse
#from taxadb.schema import *
#from taxadb import accession
#from taxadb import taxid

parser = argparse.ArgumentParser()
parser.add_argument("hmmDB", help="hmm file")
parser.add_argument("msaDir", help="directory containing .msa.faa")
parser.add_argument("taxadb", help="taxadb file")
args = parser.parse_args()

# open the RefSeqViral HMM file
allHMMfile = open(args.hmmDB)

# for each HMM, extract its name and length into a dict
print("Reading RefSeqViral.hmm")
currLength = 0
allHMM = {}
currID = 0
#limit = 10
for line in allHMMfile:
    if line.startswith("LENG"):
        currLength = int(re.search(r'LENG\s+([0-9]+)', line).group(1))
    if line.startswith("NAME"):
        currID = re.search(r'NAME.*(FAM[0-9]+)\.hmm', line).group(1)
    if line.startswith("//"):
        allHMM[currID] = {"name": currID, "length": currLength}
        #limit -= 1
        #if limit == 0:
        #    break
allHMMfile.close()
print("Done!")

print("\nReading each alignment file")
# for each alignment, list the species and "functions" from the FASTA titles
for currCluster in allHMM.keys():
    currMSA = open(args.msaDir + "/" + str(currCluster) + ".msa.faa")
    allHMM[currCluster]["annotations"] = []
    for line in currMSA:
        if line.startswith(">"):
            allHMM[currCluster]["annotations"].append(line.rstrip())
    currMSA.close()
print("Done!")

print("\nReading acc number from gathered data")
# get the accession numbers, then map them to taxids and families
for currCluster in allHMM.keys():
    accs = []
    for ca in allHMM[currCluster]["annotations"]:
        accs.append(ca.split("|")[3].split(".")[0])
    #allHMM[currCluster]["taxid"] = accession.taxid(accs, args.taxadb, Prot)
    allHMM[currCluster]["taxid"] = ["1234"]  # placeholder while the taxadb lookup is disabled
    # from taxid to family
    allHMM[currCluster]["families"] = {}
    for ct in allHMM[currCluster]["taxid"]:
        #lineage = taxid.lineage_name(ct[1], args.taxadb)
        lineage = ("1234", "ABCD", "unknownFamily")  # placeholder lineage; the family is taken at index 2
        family = lineage[2]
        if family not in allHMM[currCluster]["families"]:
            allHMM[currCluster]["families"][family] = 1
        else:
            allHMM[currCluster]["families"][family] += 1
print("Done!")

print("\nWriting files")
# write one annotation file per cluster
for currCluster in allHMM.keys():
    currAnnot = open("annotations/" + str(currCluster) + "_annotations.txt", "w")
    currAnnot.write("LENGTH\t" + str(allHMM[currCluster]["length"]) + "\n")
    currAnnot.write("FAMILIES\t" + str(allHMM[currCluster]["families"]) + "\n")
    currAnnot.write("FASTA SEQUENCE TITLES:\n")
    for currST in allHMM[currCluster]["annotations"]:
        currAnnot.write(currST + "\n")
    currAnnot.close()
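For reference, a HMMER3 text profile starts with a header of TAG/value lines terminated by "//", and the two regexes above pull the model name and length out of it. A quick illustration on made-up header lines (the FAM number and length are invented):

import re

sample = ["NAME  FAM004242.hmm\n", "LENG  318\n"]
for line in sample:
    if line.startswith("NAME"):
        print(re.search(r'NAME.*(FAM[0-9]+)\.hmm', line).group(1))  # FAM004242
    if line.startswith("LENG"):
        print(int(re.search(r'LENG\s+([0-9]+)', line).group(1)))    # 318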
#!/usr/bin/env python3
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("master", help="fasta containing all sequences")
parser.add_argument("silix", help="result of Silix computation")
parser.add_argument("destination", help="directory in which to create the new fasta files (must exist, should be empty)")
args = parser.parse_args()

# load the clusters in memory: map each sequence id to its family
clusters = open(args.silix)
seqToFam = {}
for seq in clusters:
    ids = seq.strip().split()
    seqToFam[ids[1]] = ids[0]
clusters.close()

def addSeqToFamilyFile(name, sequence):
    if len(name) == 0:
        return
    family = seqToFam[name.split()[0]]
    ofasta = open(args.destination + "/" + family + ".fasta", "a")
    ofasta.write(">" + name + "\n" + "".join(sequence))
    ofasta.close()

# now sort the sequences of the master fasta into per-family files
currName = ""
currSequence = []
sequences = open(args.master)
for line in sequences:
    if line.startswith(">"):
        addSeqToFamilyFile(currName, currSequence)
        currName = line.strip()[1:]
        currSequence = []
    else:
        currSequence.append(line)
# flush the last sequence, which the loop above never writes
addSeqToFamilyFile(currName, currSequence)
sequences.close()
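SiliX reports its clustering as one family/sequence-id pair per line, which the loader above inverts into the seqToFam dict. A small self-contained illustration with made-up family and sequence names:

silixLines = ["FAM000001\tseqA\n", "FAM000001\tseqB\n", "FAM000002\tseqC\n"]
seqToFam = {}
for seq in silixLines:
    ids = seq.strip().split()
    seqToFam[ids[1]] = ids[0]
print(seqToFam)  # {'seqA': 'FAM000001', 'seqB': 'FAM000001', 'seqC': 'FAM000002'}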
#!/usr/bin/env python3
import subprocess
import argparse
from io import StringIO

parser = argparse.ArgumentParser()
parser.add_argument("list", help="file containing a list of nucleic accession numbers")
parser.add_argument("dbName", help="prefix for output files")
parser.add_argument("p", help="process the p-th percent slice of the file (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
ntf = open(args.list)
dbName = args.dbName
of = open(dbName + "_" + p + "_prot.fasta", "w")
nucAccI = open(dbName + "_" + p + "_included.txt", "w")
nucAccX = open(dbName + "_" + p + "_excluded.txt", "w")
allList = ntf.readlines()
# task p handles the slice from (p-1)% to p% of the accession list
accList = allList[round(len(allList) * (int(p) - 1) / 100.0):round(len(allList) * int(p) / 100.0)]
ntf.close()

def addCDStoDict(cdsString, cdsDict, organism):
    # parse one accumulated CDS feature block: product, protein_id, translation
    if len(cdsString) != 0 and cdsString[0].strip().startswith("CDS"):
        currProduct = ""
        currSequence = []
        currAcc = ""
        inTranslation = False
        for cc in cdsString:
            cc = cc.strip()
            if not inTranslation:
                if cc.startswith("/product"):
                    currProduct = cc.split("=")[1].strip("\"")
                elif cc.startswith("/protein_id"):
                    currAcc = cc.split("=")[1].strip("\"")
                elif cc.startswith("/translation"):
                    currSequence.append(cc.split("=")[1])
                    inTranslation = True
            else:
                currSequence.append(cc)
        if currProduct != "":
            cdsDict[currAcc + "| " + currProduct + " [" + organism + "]"] = "".join(currSequence).strip("\"")
    del cdsString[:]

def extractProducts(geneEntry):
    prots = {}
    currCDSstr = []
    organism = "--unknown organism"
    inFeatures = False
    for line in geneEntry:
        # before the FEATURES section, only pick up the organism name
        if not inFeatures:
            if line.startswith("FEATURES"):
                inFeatures = True
            elif line.strip().startswith("ORGANISM"):
                organism = " ".join(line.split()[1:])
            continue
        # a line that is non-blank in column 1 ends the FEATURES section
        if not line.startswith(" "):
            addCDStoDict(currCDSstr, prots, organism)
            break
        # a new feature key (column 6 non-blank) closes the previous block
        if line[5] != " ":
            addCDStoDict(currCDSstr, prots, organism)
        currCDSstr.append(line.rstrip())
    return prots

for acc in accList:
    acc = acc.strip().split(".")[0]
    # RefSeq accessions contain an underscore; everything else is queried as GenBank
    goldenResult = subprocess.check_output(['golden', "-a", ("refseq:" if "_" in acc else "genbank:") + acc])
    if len(goldenResult) != 0:
        nucAccI.write(acc + "\n")
        prots = extractProducts(StringIO(goldenResult.decode("utf-8")))
        for name, seq in prots.items():
            of.write(">" + name + "\n" + seq + "\n")
    else:
        nucAccX.write(acc + "\n")

of.close()
nucAccI.close()
nucAccX.close()
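The two functions above walk a GenBank flat file: a feature key sits in column 6, its qualifiers are indented further, and a line that is non-blank in column 1 (here ORIGIN) ends the FEATURES section. A runnable illustration reusing extractProducts on a made-up record fragment (accession, organism, product and translation are all invented):

from io import StringIO

record = StringIO("""LOCUS       XX000001
  ORGANISM  Fakevirus examplei
FEATURES             Location/Qualifiers
     CDS             1..99
                     /product="hypothetical protein"
                     /protein_id="YP_000001.1"
                     /translation="MSTART"
ORIGIN
""")
print(extractProducts(record))
# {'YP_000001.1| hypothetical protein [Fakevirus examplei]': 'MSTART'}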
#!/usr/bin/env python3
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("fasta", help="fasta sequences")
parser.add_argument("p", help="process the p-th percent slice of the file (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastaf = open(args.fasta)
tempFasta = open("partial/part_" + p + ".fasta", "w")
allList = fastaf.readlines()
fastaf.close()
totalSeq = len(allList)
# nominal slice bounds; the loop below shifts them to sequence boundaries
firstLine = round(totalSeq * (int(p) - 1) / 100.0)
lastLine = round(totalSeq * int(p) / 100.0)
cursor = firstLine
inSeq = False
lastSeq = False
while cursor < totalSeq:
    line = allList[cursor]
    # start copying at the first sequence header at or after firstLine
    if not inSeq:
        if line.startswith(">"):
            inSeq = True
    # once past lastLine, stop at the next sequence header
    if lastSeq:
        if line.startswith(">"):
            break
    if cursor == lastLine:
        lastSeq = True
    if inSeq:
        tempFasta.write(line)
    cursor += 1
tempFasta.close()
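Several of these scripts share the same partitioning idiom: task p (for p from 1 to 100) takes the items from round(N*(p-1)/100) up to round(N*p/100). Since consecutive tasks share a boundary index and round() is monotonic, the hundred slices tile the input exactly, with no gap and no overlap. A quick self-check:

N = 1234  # arbitrary example length
covered = []
for p in range(1, 101):
    covered.extend(range(round(N * (p - 1) / 100.0), round(N * p / 100.0)))
assert covered == list(range(N))  # no gaps, no overlaps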
#!/usr/bin/env python3
import subprocess
import argparse
import glob
import os

parser = argparse.ArgumentParser()
parser.add_argument("dir", help="directory containing alignments")
parser.add_argument("p", help="process the p-th percent slice of the file list (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastas = glob.glob(args.dir + "/*.msa.faa")
log = open(p + ".log", "w")
nbFiles = len(fastas)
firstFile = int(round(nbFiles * (int(p) - 1) / 100.0))
lastFile = int(round(nbFiles * int(p) / 100.0))
for cfp in fastas[firstFile:lastFile]:
    familyName = '.'.join(os.path.basename(cfp).split(".")[:-2])
    outp = familyName + ".hmm"
    logp = familyName + ".log"
    # note: the HMM is deliberately named "FAM....hmm" via -n, which the
    # annotation script's NAME regex relies on
    log.write(subprocess.check_output("hmmbuild --informat afa -n %s -o %s %s %s" % (outp, logp, outp, cfp), shell=True).decode("utf-8"))
log.close()
#!/usr/bin/env python3
import subprocess
import argparse
import glob
import os

parser = argparse.ArgumentParser()
parser.add_argument("dir", help="directory containing fasta files")
parser.add_argument("p", help="process the p-th percent slice of the file list (from (p-1)%% to p%%)")
args = parser.parse_args()
p = args.p
fastas = glob.glob(args.dir + "/*.fasta")
log = open(p + ".log", "w")
nbFiles = len(fastas)
firstFile = int(round(nbFiles * (int(p) - 1) / 100.0))
lastFile = int(round(nbFiles * int(p) / 100.0))
for cfp in fastas[firstFile:lastFile]:
    outp = os.path.splitext(os.path.basename(cfp))[0] + ".msa.faa"
    logp = os.path.splitext(os.path.basename(cfp))[0] + ".log"
    log.write(subprocess.check_output("muscle -in %s -out %s -log %s" % (cfp, outp, logp), shell=True).decode("utf-8"))
log.close()
#!/usr/bin/env python3
import argparse
import fastaHmmr

parser = argparse.ArgumentParser()
parser.add_argument("fastaFile", help="original set of sequences")
parser.add_argument("minSequenceLength", help="minimum sequence length", type=int)
args = parser.parse_args()

fastaHmmr.filterFASTA(args.fastaFile, args.minSequenceLength)
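filterFASTA lives in the fastaHmmr module, whose diff is collapsed above, so its real implementation is not shown. As a hypothetical stand-in only, here is a minimal sketch under the assumption that it simply drops sequences shorter than minSequenceLength (the output file naming is also invented):

# Hypothetical stand-in for fastaHmmr.filterFASTA; the real code is in the
# collapsed diff above. Assumes: keep sequences >= minSequenceLength residues.
def filterFASTA(fastaFile, minSequenceLength):
    name, seq = None, []
    with open(fastaFile) as fin, open(fastaFile + ".filtered", "w") as fout:
        def flush():
            if name is not None and len("".join(seq)) >= minSequenceLength:
                fout.write(">" + name + "\n" + "".join(seq) + "\n")
        for line in fin:
            if line.startswith(">"):
                flush()
                name, seq = line.strip()[1:], []
            else:
                seq.append(line.strip())
        flush()  # the last sequence is not followed by a header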
#! /bin/sh
#SBATCH --cores=4
date >> log.txt
echo "Job ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialBlast.py /pasteur/homes/tbigot/bank/rVDB_prot/rVDB_prot2_plus_noDupes_minL1_s100.fasta ${SLURM_ARRAY_TASK_ID}
srun blastp -query partial/part_${SLURM_ARRAY_TASK_ID}.fasta -out result/part_${SLURM_ARRAY_TASK_ID}.out -db /pasteur/homes/tbigot/bank/rVDB_prot/rVDB_prot2_plus_noDupes_minL1_s100.fasta -outfmt 6 -num_threads 4
exit 0
#! /bin/sh
#SBATCH --qos=fast
#SBATCH --cores=1
date >> log.txt
echo "HMMBUILD ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialHmmbuild.py /pasteur/homes/tbigot/bank/rVDB_prot/aligned ${SLURM_ARRAY_TASK_ID}
exit 0
#! /bin/sh
#SBATCH --qos=fast
#SBATCH --cores=1
date >> log.txt
echo "MUSCLE ${SLURM_ARRAY_JOB_ID}.${SLURM_ARRAY_TASK_ID}.">> log.txt
srun python3 /pasteur/homes/tbigot/bank/scripts/partialMuscle.py /pasteur/homes/tbigot/bank/rVDB_prot/clusters ${SLURM_ARRAY_TASK_ID}
exit 0
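All three job scripts pass ${SLURM_ARRAY_TASK_ID} through as the p argument, so they are presumably meant to be submitted as 100-task array jobs (along the lines of sbatch --array=1-100 <script>), each task handling one 1% slice of its input.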