diff --git a/src/lilamsxml_20080130.py~ b/src/lilamsxml_20080130.py~ deleted file mode 100644 index 46903301560552f73fd5050e4912e39ad7776a0b..0000000000000000000000000000000000000000 --- a/src/lilamsxml_20080130.py~ +++ /dev/null @@ -1,905 +0,0 @@ -#!/usr/bin/python -""" -lilams - python script for the analysis -of SILAC experiments with MALDI spectra -and MSFit protein identifications - -first version - September 2003 -this version - September 2004 -slightly revised in oct 2005 and jan 2008 - -Allows generation of XML output containing -the whole information used for quantitations -As a result, the output of this script is an -XML-like string with <pepdata>...</pepdata> -as delimiters. - -Uses HTMLparser to parse the MSFit files. While much -slower, this method is much more flexible - -GIM - Institut Pasteur -first argument - spectrum (text format) (--spectrum, -s) -second argument - Msfit file (html format) (--msfithtml, -i) -third argument - a list of ORFs that will be quantified (--orflist, -p) -fourth argument - the path to the ipc, Isotope Pattern Calculator -executable (--ipc_path, -c) as it may vary in localization - -Returns the results to the stdout with the peptide sequence and differences -between observed peak intensities and their theoretical intensities. -By default 10 peak differences from a family of peaks are returned -this takes in account 0, 1, 2 or 3 leucines -Makes use of the external IPC program for calculating -peak mass distributions, if not on the system path, please modify -the call in the RunIPC function - -Modified on 20080130 to handle the absence of a mowse score -Will report 1, if a Mowse score is present -""" -import sys, re, os, string, copy, math, getopt -from MsFitHTMLparse02 import MSFit32Parser -#from elementtree import ElementTree as ET -import cElementTree as ET -def TEST(specfileh, msfith): - return specfileh.readline() -############################################# -####### MSFit html file parser ######### -############################################# -def ParseMSFITold(msfitfilehandle): - """ - Parses an MSFit generated html identification file - - Returns a dictionary: - key = rank -> subdict key = protdata - -> subdict key = pepdata (mzh, pepseq) - needs the re module - """ - data_dict = {} - #REGULAR expressions to parse an MS-FIT identification htm file - rankRE = r"\<A NAME=\"([0-9])\"\>\<\/A\>\<!--rank--\>[0-9].+" - accesionRE = r"--accession_number--\>(.+)\<\!--spe.+" - nameRE = r"--entry_name--\>\>(.+)" - entrylineRE = rankRE + accesionRE + nameRE - entrylinePATT = re.compile(entrylineRE) - pepinmassRE = r"--in_mass--\>([0-9\.]+)\<.+" - pepseqmodRE = r"\)([A-Z]+)\(" - pepseqRE = r"sequence=([A-Z]+)\\" - peplineREmod = pepinmassRE + pepseqmodRE - peplineRE = pepinmassRE + pepseqRE - peplinePATT = re.compile(peplineRE) - peplinePATTmod = re.compile(peplineREmod) - try: - #infile = open(msfitfilename, 'r') - lines = msfitfilehandle.readlines() - - for line in lines: - entry = entrylinePATT.search(line) - if entry: - efound = entry.groups() - data_dict[efound[0]] = {"protdata":(efound[1], efound[2][:-1]), "pepdata":[]} - #strip the last character from the entry name (efound[2]) - #rank, accesion nb, entry name - else: - pepmatch = peplinePATT.search(line) - if pepmatch: - data_dict[efound[0]]["pepdata"].append([pepmatch.groups()[0],\ - pepmatch.groups()[1]]) - else: - pepmatch = peplinePATTmod.search(line) - if pepmatch: - data_dict[efound[0]]["pepdata"].append([pepmatch.groups()[0],\ - pepmatch.groups()[1]]) - finally: - #infile.close() - return data_dict - -def ParseMSFIT(msfitfilehandle): - """ - Parses an MSFit generated html identification file - - Returns a list of dictionaries with 2 entries: - dict key = protdata - with many subkeys - dict key = pepdata with subkeys 'matched' - a list - m/z - sequence, and 'unmatched' - a simple list - - """ - htmlpage = msfitfilehandle.read() - parser = MSFit32Parser() - parser.feed(htmlpage) - parser.close() - return parser.get_dictionary() - -######TESTING###### -################### -#msfitfile = sys.argv[1] -#resdict = {} -#resdict = ParseMSFIT(msfitfile) -#print resdict - -################################################### -##### STATISTICAL FUNCTIONS ################# -################################################### - -def CalcMonoMz(pepseq): - """ - Returns the m/zH+ for a peptide - - Using the monoisotopic masses of aminoacid residues calculates the - monoisotopic mass of a peptide with a given sequence. Variations - may be implemented by the use of modified aminoacid masses - """ - Daamass = {"G":57.02147, "A":71.03712,"S":87.03203, - "P":97.05277, "V":99.06842,"T":101.04768, - "C":103.00919, "carbamidomethyl":57.02146, - "carboxymethyl":58.00547, "acrylamide":71.05511, - "I":113.08407, "L":113.08407, "N":114.04293, - "D":115.02695, "Q":128.05858, "K":128.09497, - "E":129.0426, "M":131.04049, "MOxidized":147.0354, - "H":137.05891, "F":147.06842, "R":156.10112, - "Y":163.06333, "W":186.07932, "water":18.01056, "Hplus":1.007825, - "Oxygen16":15.99491} - mass = 0.0 - for i in range(len(pepseq)): - mass+=Daamass[pepseq[i]] - mass+=Daamass["water"]+Daamass["Hplus"] #add the water and charged H+ - return mass -def Mean(inlist): - """ - Obtain the mean of the values from a list - - """ - sum = 0 - for item in inlist: - sum = sum + item - if len(inlist)>0: - return sum/float(len(inlist)) - else: - return 0 -def SumSquares(inlist): - """ - Squares each value in the passed list, adds up these squares and - returns the result. - """ - ss = 0 - for item in inlist: - ss = ss + item*item - return ss - -def Variance(inlist): - """ - Calculates the variance for a sample - Returns 1 if less than 2 values are provided - """ - n = len(inlist) - mn = Mean(inlist) - ssq = SumSquares(inlist) - if n >1: - vari = (ssq - n*mn*mn)/(n-1) - else: - vari = 1 - return vari -def StdDev(inlist): - """ - Returns the standard deviation of the values in the passed list - using N-1 in the denominator (i.e., to estimate population stdev). - """ - return math.sqrt(Variance(inlist)) - -def Median(inlist): - """ - Returns the median of the passed list. - Returns 0 or the element if less than 2 elements - are passed to the function - """ - newlist = copy.deepcopy(inlist) - newlist.sort() - if len(newlist) == 0: - median = 0 - elif len(newlist) == 1: - median = newlist[0] - elif len(newlist) % 2 == 0: # if even number, simple mean of middles - index = len(newlist)/2 - median = float(newlist[index] + newlist[index-1]) /2 - else: - index = len(newlist)/2 - median = newlist[index] - return median - -############################################# -####### IO functions ##################### -############################################# -def ReadFromFile(fhandle, skip=0): - """ - Obtain data from a tab delimited file into a list - - Needs the import string statement - The <skip> parameter indicate the number of lines not - read. This version uses a filehandle for calls - - """ - - #for line in file: - # data.append(map(float,line.strip().split("\t"))) - # an iterator could be used. - - data = [] - #f=open(filename, 'r') - for i in range(skip): - fhandle.readline() - while (1): - line = fhandle.readline() - if line == "": break - #line = line[:-1] #strip the last character (the newline usually) - data.append(map(float,line.strip().split('\t'))) - #f.close() - return data - #the returned data will be in float format - - -def WriteMToFile(matrix, file_handle): - """ - Writes the content of a matrix (list of lists) to a - tab delimited text file - """ - for line in matrix: - for item in line: - file_handle.write(str(item)) - file_handle.write('\t') - file_handle.write('\n') - -def WriteMToString(matrix): - """ - Returns the content of a matrix to a tab delimited - string - """ - res = "" - for line in matrix: - for item in line: - res = res + (str(item)) + '\t' - res = res + '\n' - return res - -def WriteListToString(list, format): - res = "" - for item in list: - res = res + format %(item) + " " - return res - -############################################### -######### SPECTRA ANALYSIS functions ########## -############################################### -def GetPeak(data, peakx, lo=0, hi=41357): - """ - Obtain the x y values for the pair most closely to the peak mass - - Uses a binary search algorithm since the list is sorted - 41357 is the number of datapoints from an exported spectrum between - 700 and 3000 Da - - """ - mindif = 0.037 #it depends on the minimal distance between values - #should be larger than half that minimal distance - #hi = len(data) - #lo = 0 - dif = 10000.0 - - while (abs(dif) > mindif): - middle = int(round((hi-lo)/2 +lo)) - dif = peakx - data[middle][0] - if dif == 0: break - if dif > 0: - lo = middle - else: - hi = middle - return middle - -def CrowlToMax(peakindex, howmuch, data): - """ - Returns the real position of a maximum - - Go around to see if the peak is really at the maximum value, otherwise - leave it as it was - <howmuch> the number of values around the actual value - 10 is a good value - """ - peak_int = data[peakindex][1] - newmax = peakindex - #initialize with the value of the peak index, will be changed later - for i in range(peakindex-howmuch, peakindex+howmuch): - if peak_int <= data[i][1]: - newmax = i - peak_int = data[i][1] - return newmax - - -def GetBackground(monoindex, points, data, level=0.05): - """ - Define the background as the average of the <level>% lowest - intensity values; uses function Mean - - <monoindex> - position of the monoisotopic peak - <points> points considered - <data> list of floats m/z, intensity - <level> percent of points that might be considered in background - """ - #obtain the values - intensities = [] - for i in range(monoindex, monoindex+points): - intensities.append(data[i][1]) - #sort the list - intensities.sort() - smallestval_list = intensities[0:int(len(intensities)*level)] - return Mean(smallestval_list) - -def GetIsotopeSeries(monoindex, howmany, data): - """ - Returns a list of intensities in a series of peaks - - Starting from the monoisotopic peak obtain maximum peaks at 1 Da - distance intervals; uses function CrowlToMax - <howmany> defines the number of peaks to be retrieved - <monoindex> the mass of the firs peak in the series - <data> a list containing floats in pairs: mass/intensity - """ - # calculate average distance for a dalton in the range of monoindex, between15 and 30 values/dalton normally - distforonedalton = abs(round(100/(data[monoindex][0] - data[(monoindex+100)][0]))) - - peaklist = [] - for i in range(0, howmany): - peakindex = CrowlToMax(int(monoindex + i*distforonedalton), int(distforonedalton/3), data) - #we accept deviations of distforonedalton/3 - that is 0,33 Da - this can be modified - peakvalue = data[peakindex][1] - peakamu = data[peakindex][0] - peaklist.append([peakamu, peakvalue]) - return peaklist - - -def GetPeaksFromValue(monopeak, data, bground = 100, nbisotopes = 14, crowlnb = 5): - """ - Generates the list (amu and intensities) from a peak amu value - - returns the family of peaks (amu and intensities) starting - from a value of a peak measured in the spectra; - uses functions GetPeak, GetBackground, GetIsotopeSeries, - CrowlToMax - monopeak - the mass of the monoisotopic peak - bground - number of points around the peak to be considered - for calculating the background - nbisotopes - number of peaks returned for a family - crowlnb - number of points tested to see if a peak is a peak - - """ - resultlst = [] - #see if we may find the peak around that value - peak_pos = GetPeak(data, monopeak, hi=len(data)) - #test to see if there is no peak higher at 5 values around - real_peakpos = CrowlToMax(peak_pos, crowlnb, data) - background = GetBackground(real_peakpos, bground, data) - #print "background", background - peak_list = GetIsotopeSeries(real_peakpos, nbisotopes, data) - - for element in peak_list: - resultlst.append([element[0], element[1]-background]) - - return resultlst - -### INTERFACE WITH Isotope Pattern Calculator IPC ### -def RunIPC(peptide, howmany, monoisoint, ipc_path): - """ - Runs the external program IPC -> theoretical isotopic distribution - - Returns a list of peptide masses+H and intensities at 1 Da distance, - normalized to the intensity of the monoisoint peak. - the ipc program must be available on the shell path; needs os and - string import. - - How to test: - pep = "AADGGHGGHHILL" - hmany = 6 - rlist = [] - rlist = RunIPC(pep, hmany) - """ - HMASS = 1.007825 #to be added to the peaks for m/z calculations - #ipccommand = r"/usr/local/ipc-1.1/ipc" #correct for agathe, modify if on other machine - ipccommand = ipc_path - ipciterations = "-f 500" - ipcresolution = "-d 1" #this will sum the intensities of peaks at less than 0.1 Da apart - ipcpeptide = "-a " + peptide - - pipe_ipc = os.popen("%s %s %s %s" % (ipccommand, ipciterations, ipcresolution, ipcpeptide)) - pipe_ipc.readline() - out_list = [] - res_list = [] - for i in range(howmany): - out_list.append(string.split(pipe_ipc.readline())) - - #print out_list - for elm in out_list: - amustr = elm[1] - percentstr = elm[3] - res_list.append([float(amustr[0:-1])+HMASS, float(percentstr[0:-1])]) - #the elements 1 and 3 are the amu and the relative abundance - #the [0:-1] chops the last character (comma) - - norm_lst = [] #normalized list in function of the intensity of the first peak - for i in range(len(res_list)): - norm_lst.append([res_list[i][0], res_list[i][1]*monoisoint/res_list[0][1]]) - - return norm_lst - -def IsDifference(difference, dif, tolerated): - """ - Returns 1 if the difference is in the tolerated range around dif - - accepts an error, the "tolerated" parameter - """ - if (difference>(dif-tolerated) and difference <(dif+tolerated)): - return 1 - else: - return 0 - -def PepCompare(pepmz, pepseq): - """ - Evaluates the measured mass of a peptide to see if its deuterated or not - - Returns a dictionary containing the corrected pepmz if deuterated, - the peptide sequence (unchanged), and two flags MODIFIED if MetOX or other - has been found and IGNORE if no known modification was found. It also calculates - the number of leucines and reports it as a fifth element NBLEU. - Uses CalcMonoMz and IsDifference. - """ - HPLUS = 1.007825 - tolerated = 0.3 #daltons - theor_mz = CalcMonoMz(pepseq) - modif_dict = {"MetOX":15.9949} - deut_dict = {"NotD": 0.0, "1LeuD3": 3.023, "2LeuD3" : 6.047, "3LeuD3": 9.07, "4LeuD3": 12.094} - - #test if the peptide is the deuterated form (up to 4 Leu) - diff_mz = pepmz - theor_mz - MODIFIED = 0 - IGNORE = 0 - NBLEU = pepseq.count('L') - - FOUND = 0 - for mod in modif_dict.keys(): - #print mod, modif_dict[mod], "modif" - #print deut_dict.keys() - for dif in deut_dict.keys(): - #print dif, deut_dict[dif], "dif" - if IsDifference(diff_mz, deut_dict[dif], tolerated): - #print dif, "found" - pepmz = pepmz - deut_dict[dif] - FOUND = 1 - break - elif IsDifference(diff_mz, deut_dict[dif]+modif_dict[mod], tolerated): - #print dif, "found and modification", mod - pepmz = pepmz - deut_dict[dif] - MODIFIED = 1 - FOUND = 1 - break - if not FOUND: - #none of the tested modifications or deuteration accounted for the mass difference - IGNORE = 1 - #print pepseq, "ignored" - return {"pepmz":pepmz, "pepseq":pepseq, "MODIFIED":MODIFIED, "IGNORE":IGNORE, "NBLEU":NBLEU} - -def CalcAndGet(pepmz, pepseq, data, peaks, ipc_path): - """ - Calculates the theoretical intensities of the first 4 peaks - for peptides with 1 Leucine, the pepmz corresponds to the nondeuterated form. - - This function is called only when the NBLEU of a peptide is 1 - Returns a list of differences between the measured and theoretical - intensities for the peaks normalized to the monoisotopic peak. - Uses most of the functions previously defined by calls to - GetPeaksFromValue and RunIPC. The first elements of the returned - list are the peptide sequence, mz and the intensity of the monoisotopic peak - """ - theor_mz = CalcMonoMz(pepseq) - mz_difference = pepmz - theor_mz - measured_lst = GetPeaksFromValue(pepmz, data, nbisotopes=peaks) - #print measured_lst, "measured_lst" - #recover the peaks on a range of 6 daltons from the monoisotopic peak - monointensity = measured_lst[0][1] - if pepseq.count("L") == 1: - theor_lst = RunIPC(pepseq, 6, monointensity, ipc_path) - #obtain the calculated list of pairs (mz, Intensity) based on the monointensity value given - - dif_lstint= [pepseq] - dif_lstint.append(pepmz) - #dif_lstint.append(monointensity) - - for i in range(len(measured_lst)): - #we only substract from the measured values the peaks 4, 5, 6 - if pepseq.count("L") == 1: - if i < 3 or i > 5: - dif_lstint.append(measured_lst[i][1]) - else: - dif_lstint.append(measured_lst[i][1]-theor_lst[i][1]) - else: - dif_lstint.append(measured_lst[i][1]) - - #print dif_lstint, "dif_lstint" - - return dif_lstint - -def MzIntGet(data, pepmz, plusminus=15): - """ - Obtain the list of mz-int pairs - from the spectrum data - - Returns tuple of two lists - """ - mzdata, intdata = [], [] - - peakindex = GetPeak(data, float(pepmz), hi=len(data)) - distforonedalton = abs(round(100/(data[peakindex][0] - data[(peakindex+100)][0]))) - minindex = int(peakindex - distforonedalton*plusminus) - maxindex = int(peakindex + distforonedalton*plusminus + 6*distforonedalton) - for i in range(minindex, maxindex): - mzdata.append(data[i][0]) - intdata.append(data[i][1]) - return(mzdata, intdata) - -################################## -#### Processing data structure ### -################################## -class DataIntObj: - """ - A data structure to handle silac - difference results and calculate - statistics on them - - """ - def __init__(self): - self.nmbentries = 0 - self.data = {} - self.results = {} - self.statistics = {} - #dictionaries with unique keys formed by pepseq+mzh - self.deleted = [] - - - def get_data(self , matrix): - """ - transforms the matrix obtained by several - calls to DiffMeasTheor into internal data - """ - for element in matrix: - listdiff = [] - for i in range(2, len(element)-1): - #first element is pepseq, second element modif pepmz - listdiff.append(float(element[i])) - intdict = {"pepseq":element[0], "mzh":float(element[1]), - "calcmzh":CalcMonoMz(element[0]), - "int0":float(element[2]), - "BADPEAKS": 0, - "listdiff":listdiff} - intdict["massdiff"]=intdict["mzh"]-intdict["calcmzh"] - intdict["Lnumber"]=intdict["pepseq"].count('L') - self.data[element[0]+'_'+str(element[1])] = intdict - return self.data - - def filter_peaks(self): - """ - Eliminates from self.data the peaks for which - the peak 3 or multiple is higher than 2 or multiple - Decides whether to take the non-deuterated or the - deuterated form in function of the intensity of the - corresponding peaks - """ - entrieslist = self.data.keys() - for entrykey in entrieslist: - #print self.data[entrykey] - peak2, peak3, peakdx2, peakdx3 = 0.0, 0.0, 0.0, 0.0 - nbleu = self.data[entrykey]["Lnumber"] - if nbleu <= 4: - #treat maximum 4 leucines - if len(self.data[entrykey]["listdiff"]) > 13: - peak2 = self.data[entrykey]["listdiff"][1] - peak3 = self.data[entrykey]["listdiff"][2] - peakdx2 = self.data[entrykey]["listdiff"][nbleu*3+1] - peakdx3 = self.data[entrykey]["listdiff"][nbleu*3+2] - #print peak2, peak3, peakdx2, peakdx3, "peaks" - if (peak2+peak3) > (peakdx2+peakdx3): - #work only with the highest value pair - if peak3 > peak2: - del self.data[entrykey] - self.deleted.append(entrykey) - #print " <br> Considered unreliable", entrykey - else: - if peakdx3 > peakdx2: - del self.data[entrykey] - self.deleted.append(entrykey) - #print " <br> Considered unreliable", entrykey - else: - del self.data[entrykey] - #print " <br> List too short", entrykey - else: - del self.data[entrykey] - #print " <br> More than 4 leucines", entrykey - return self.data - - def calc_ratios(self): - """ - Calculates the ratios deut/nondeuterated peptides - in a mix based on peptide sequence (0, 1-4 leucines) - """ - entrieslist = self.data.keys() - - for key in entrieslist: - nbleu = self.data[key]["Lnumber"] - monoisoint = self.data[key]["listdiff"][0] - if nbleu <> 0 and monoisoint <> 0: - self.results[key] = self.data[key]["listdiff"][nbleu*3]/monoisoint - else: - self.results[key] = 0.0 - return self.results - - - def calc_statistics(self): - """ - Median, average and StdDev for the fractions calculated for - peptides with 1, 2, 3 or 4 leucines - """ - frlist = [] - entrieslist = self.data.keys() - for key in entrieslist: - nbleu = self.data[key]["Lnumber"] - if (nbleu > 0 and nbleu < 5): - frlist.append(self.results[key]) - self.statistics["Median"]=Median(frlist) - self.statistics["Mean"]=Mean(frlist) - self.statistics["StdDev"]=StdDev(frlist) - self.statistics["Nbvalues"]=len(frlist) - #print self.statistics - - return self.statistics - - - def return_resultsXML(self, spectr_dict, record): - """ - The object contains all the required information - the results are returned in very simple XML-like format - """ - #res = [["Pepseq", "mz", "mzcalc", "Leucines", "MonoisoIntensity", "DeuteratedIntensity", - #"Deuterated_fraction"]] - #Add primary data - root= ET.Element("protdata") - idroot = ET.SubElement(root, "id_parameters") - comm = ET.SubElement(idroot, "spot_id") - comm.text = record["protdata"]["comment"] - dbase = ET.SubElement(idroot, "database") - dbase.text = record["protdata"]["database"] - ppm = ET.SubElement(idroot, "ppm_tolerance") - ppm.text = record["protdata"]["parent_mass_tolerance"] - mowse = ET.SubElement(idroot, "mowse_score") - try: - mowse.text = record["protdata"]["mowse_score"] - except: - mowse.text = "1.0" - protcovg = ET.SubElement(idroot, "pep_coverage") - protcovg.text = record["protdata"]["protein_coverage"] - matchedpep = ET.SubElement(idroot, "no_matched_pept") - matchedpep.text = str(len(record["pepdata"]["matched"])) - accnb = ET.SubElement(root, "accession_number") - accnb.text = record["protdata"]["accession_number"] - enname = ET.SubElement(root, "entry_name") - enname.text = record["protdata"]["entry_name"] - protmw = ET.SubElement(root, "prot_mw") - protmw.text = record["protdata"]["protein_mw"] - protpi = ET.SubElement(root, "prot_pi") - protpi.text = record["protdata"]["protein_pI"] - - pepdataroot = ET.SubElement(root, "pepdata") - - for peprec in self.data.keys(): - nbleu = self.data[peprec]["Lnumber"] - if nbleu in (1,2,3,4): - deuterated = self.data[peprec]["listdiff"][nbleu*3] - else: - deuterated = 0.0 - - pepentry = ET.SubElement(pepdataroot, "measured") - pepseq = ET.SubElement(pepentry, "pepseq") - pepseq.text = self.data[peprec]["pepseq"] - mzh = ET.SubElement(pepentry, "mzh") - mzh.text = "%.4f" %(self.data[peprec]["mzh"]) - calcmzh = ET.SubElement(pepentry, "calc_mzh") - calcmzh.text = "%.4f" %(self.data[peprec]["calcmzh"]) - Lnumber = ET.SubElement(pepentry, "leucines") - Lnumber.text = str(self.data[peprec]["Lnumber"]) - int0 = ET.SubElement(pepentry, "mono_iso_intensity") - int0.text = "%.0f" %(self.data[peprec]["int0"]) - intd = ET.SubElement(pepentry, "deuterated_intensity") - intd.text = "%.0f" %(deuterated) - ratio = ET.SubElement(pepentry, "ratio") - ratio.text = "%.2f" %(self.results[peprec]) - validated = ET.SubElement(pepentry, "validated") - validated.text = "UNKNOWN" - #lookedat = ET.SubElement(pepentry, "looked_at") - #lookedat.text = "NO" - local_bg = ET.SubElement(pepentry, "local_bg") - local_bg.text = "0.0000" - corr_ratio = ET.SubElement(pepentry, "corrected_ratio") - corr_ratio.text = "%.2f" %(self.results[peprec]) - spec_piece = ET.SubElement(pepentry, "spectrum") - mzspec = ET.SubElement(spec_piece, "mz_list") - mzspec.text = WriteListToString(spectr_dict[self.data[peprec]["pepseq"]][0], "%.4f") - intspec = ET.SubElement(spec_piece, "int_list") - intspec.text = WriteListToString(spectr_dict[self.data[peprec]["pepseq"]][1], "%.4f") - - #~ res.append([self.data[peprec]["pepseq"], "%.4f" %(self.data[peprec]["mzh"]), - #~ "%.4f" %(self.data[peprec]["calcmzh"]), self.data[peprec]["Lnumber"], - #~ "%.0f" %(self.data[peprec]["int0"]), "%.0f" %(deuterated), - #~ "%.2f" %(self.results[peprec])]) - - return ET.tostring(root) - - - def return_results(self): - """ - The object contains all the required information - it is returned as a list of lists - """ - res = [["Pepseq", "mz", "mzcalc", "Leucines", "MonoisoIntensity", "DeuteratedIntensity", - "Deuterated_fraction"]] - #Add primary data - for peprec in self.data.keys(): - nbleu = self.data[peprec]["Lnumber"] - if nbleu in (1,2,3): - deuterated = self.data[peprec]["listdiff"][nbleu*3] - else: - deuterated = 0.0 - res.append([self.data[peprec]["pepseq"], "%.4f" %(self.data[peprec]["mzh"]), - "%.4f" %(self.data[peprec]["calcmzh"]), self.data[peprec]["Lnumber"], - "%.0f" %(self.data[peprec]["int0"]), "%.0f" %(deuterated), - "%.2f" %(self.results[peprec])]) - - res.append(["Nbofvalues", "Median", "Mean", "StdDev"]) - res.append(["%s" %(self.statistics["Nbvalues"]), - "%.2f" %(self.statistics["Median"]), - "%.2f" %(self.statistics["Mean"]), - "%.4f" %(self.statistics["StdDev"])]) - return res - - -####################### -#####calls from CGI#### -####################### -def ProcessSpectrum(spectrahandle, msfithandle, orflisthandle, ipc_path): - """ - A function to simplify the aspect of Main - uses handles to file or file-like objects - Returns the results in list/string form - """ - global calc_res - calc_res = "" - #insert data from the spectra in a list of floats - data_num = ReadFromFile(spectrahandle, 2) - data_num_minmz = data_num[0][0] - data_num_maxmz = data_num[-1][0] - #parse the HTML file - parseddict = ParseMSFIT(msfithandle) - #print "Parsing done" - #parseddictkeys = parseddict.keys() #####modified - #parseddictkeys.sort() #####modified - #pay attention - sorted by string not by integer value - - #parse the orflist, the list of interesting things to be quantified - orflist = [] - #the next lines are only necessary if the entries were not - #well formated in the database. To be modified otherwise. - ############################################### - for line in orflisthandle: - orflist.append(line.strip()) - #print 'orflist 0', orflist[0] - ############################################### - - record = parseddict[0] #take only the first record - proteinrank = record['protdata']['rank'] - if record['protdata']['accession_number'] in orflist: - - peplist = record['pepdata']['matched'] - #the list of pairs mzh/pepseq for a given protein - result_matrix = [] - spectrum_dict = {} - plusminus = 25 - #the number of daltons around the monoisotopic peak to be retrieved - for peptide in peplist: - #print "Working on ", peptide[1] - difflist = [] - peptidesequence = peptide[1] - inputpepmz = float(peptide[0]) - decision_dict = PepCompare(inputpepmz, peptidesequence) - #if the peptide entry is a deuterated peptide or if its mass is already in the - #result_matrix (DUPLICATED = 1), no calculation is performed - - if decision_dict["IGNORE"] == 0: - DUPLICATED = 0 - for i in range(len(result_matrix)): - if abs(result_matrix[i][1]-decision_dict["pepmz"]) < 0.5: - #look for pepmz similar in the range +-0.5 Da to the one tested. - #If it exists do not run CalcAndGet - DUPLICATED = 1 - break - if DUPLICATED == 0: - #test also if the range of mass from the spectrum is OK, otherwise stop - if decision_dict["pepmz"] > data_num_minmz and\ - decision_dict["pepmz"] < data_num_maxmz: - difflist = CalcAndGet(decision_dict["pepmz"], \ - decision_dict["pepseq"], data_num, 16, ipc_path) - result_matrix.append(difflist) - spectrum_dict[decision_dict["pepseq"]] = MzIntGet(data_num, - decision_dict["pepmz"], plusminus) - - rawdiff= DataIntObj() - rawdiff.get_data(result_matrix) - rawdiff.filter_peaks() - rawdiff.calc_ratios() - pepresults = rawdiff.return_resultsXML(spectrum_dict, record) - else: - pepresults = "nodata" - #~ res = ET.Element("pepdata") - #~ ET.SubElement(res, "nodata") - #~ pepresults = ET.tostring(res) - - return pepresults - -#################### -####### MAIN ####### -#################### -class Usage(Exception): - """ - used for catching exceptions generated - during command line parameter parsing - """ - def __init__(self, msg): - self.msg = msg -def main(argv=None): - global fname_spectra, fname_msfit, fname_orflist, ipc_command_path - global spectrumhandle, msfithandle, orflisthandle - spectrumhandle, msfithandle, orflisthandle = sys.stdin, sys.stdin, sys.stdin - - if argv is None: - argv = sys.argv - try: - try: - opts, args = getopt.getopt(argv[1:], "hs:i:p:c:", ["help", "spectrum=", - "msfithtml=", 'orflist=', "ipc_path="]) - except getopt.error, msg: - raise Usage(msg) - # option processing - - fname_spectra, fname_msfit = "", "" - for option, value in opts: - if option in ("-h", "--help"): - raise Usage(__doc__) - if option in ("-s", "--spectrum"): - fname_spectra = value - if option in ("-i", "--msfithtml"): - fname_msfit = value - if option in ("-p", "--orflist"): - fname_orflist = value - if option in ("-c", "--ipc_path"): - ipc_command_path = value - - if (fname_spectra == "" or fname_msfit == "" or fname_orflist == ""): - raise Usage("no input filenames!") - sys.exit(2) - else: - try: - spectrumhandle = open(fname_spectra, "r") - msfithandle = open(fname_msfit, "r") - orflisthandle = open(fname_orflist, "r") - #print "OK", fname_spectra, fname_msfit, fname_orflist - print ProcessSpectrum(spectrumhandle, msfithandle, orflisthandle) - finally: - spectrumhandle.close() - msfithandle.close() - orflisthandle.close() - except Usage, err: - print >>sys.stderr, sys.argv[0].split("/")[-1] + ": " + str(err.msg) - print >>sys.stderr, "\t for help use --help or -h" - return 2 - -if __name__ == "__main__": - sys.exit(main())