#!/usr/bin/env python from collections import defaultdict import os from shutil import copyfile import pandas as pd import numpy as np from Bio import SeqIO import os.path SOFTWARES = {'meme','dreme','centrimo','meme_tomtom'} ##TYPES_SEARCHES = {'All','Narrow','Const','Max'} def parse_all_logs(path_analysis, exp_design_name): with open(path_analysis + 'Motif_search.log', 'w') as general_log: header = 'Data_name\t' + '\t'.join(SOFTWARES) general_log.write(header+'\n') for data_name in os.listdir(path_analysis + 'results' + os.sep): if not data_name.startswith('.') and not data_name.endswith('.log') and not data_name.endswith('.sh'): progress_filename = path_analysis + 'results' + os.sep + data_name + os.sep + 'progress_log.txt' line = data_name + '\t' for software in SOFTWARES: found = False with open(progress_filename,'r') as file: for row in file: status_soft = 'name: '+software+' status: 0' if status_soft in row: found = True if found: line += '1\t' else: line += '0\t' general_log.write(line.strip() + '\n') def parse_dreme(path_analysis, path_motif, exp_design_name): with open(path_analysis + 'dreme.sh', 'w') as dreme_file: for data_name in os.listdir(path_analysis + 'results' + os.sep): if exp_design_name in data_name: print(data_name) if not data_name.startswith('.') and not data_name.endswith('.log') and not data_name.endswith('.sh'): # print(data_name) dreme_xml = path_analysis + 'results' + os.sep + data_name + os.sep + 'dreme_out' + \ os.sep + 'dreme.xml' if os.path.exists(dreme_xml): dreme_list = list() with open(dreme_xml, 'r') as file: for row in file: dreme_list.append(row.strip()) if '<command_line>' in row: dreme_file.write(row) dreme_txt = path_analysis + 'results' + os.sep + data_name + os.sep + 'dreme_out' \ + os.sep + 'dreme.txt' dreme_txt_list = list() with open(dreme_txt, 'r') as file: for row in file: dreme_txt_list.append(row.strip()) dreme_txt_intro = '' write_intro = False for row in dreme_txt_list: if row.startswith('MOTIF '): break if 'MEME version' in row: write_intro = True if write_intro: dreme_txt_intro += row + '\n' #print(dreme_txt_intro) # parse dreme with open(path_motif + os.sep + 'logs' + os.sep + 'Dreme_' + data_name + '.log', 'w') as general_log: header = 'ID\tMotif\tNb_sites\tPos_Occurence\tNeg_Occurence\tNb_Seq\t' \ 'Percent\tPos_Percent\tNeg_Percent\tEvalue' general_log.write(header + '\n') # search length fasta for line in dreme_list: if '<positives name=' in line: number_seq = float(line.split(' ')[2].replace('\"','').replace('count=','')) # search motifs for i in range(1, len(dreme_list)): line = dreme_list[i] if '<motifs>' in line: index_motifs = i break # get motifs info for k in range(index_motifs+1, len(dreme_list)): if '<motif' in dreme_list[k]: new_line = dreme_list[k].replace('\"','').split(' ') #print(new_line) id = new_line[1].replace('id=', '') #print(id,data_name) #print(dreme_txt_intro) seq = new_line[2].replace('seq=', '') occ = float(new_line[4].replace('nsites=', '')) pos_occ = float(new_line[5].replace('p=', '')) neg_occ = float(new_line[6].replace('n=', '')) evalue = new_line[8].replace('evalue=', '') # save table line_to_write = [id, seq, str(occ), str(pos_occ),str(neg_occ), str(number_seq), str(occ/number_seq), str(pos_occ/number_seq), str(neg_occ/number_seq), str(evalue)] general_log.write('\t'.join(line_to_write) + '\n') # copy png file for imagefile in os.listdir(path_analysis + 'results/' + data_name + '/dreme_out/'): if id in imagefile: if 'nc_' in imagefile: src = path_analysis + 'results'+ os.sep + data_name + os.sep + \ 'dreme_out' + os.sep + imagefile dst = path_motif + os.sep + 'motif_figure' + os.sep + seq + '_nc.png' copyfile(src, dst) else: src = path_analysis + 'results' + os.sep + data_name + \ os.sep + 'dreme_out' + os.sep + imagefile dst = path_motif + os.sep + 'motif_figure' + os.sep + seq + '_rc.png' copyfile(src, dst) # save motif with open(path_motif + os.sep + 'motif' + os.sep + seq + '.meme', 'w') as meme_file: meme_file.write(dreme_txt_intro+'\n') # find motif info dreme_txt_motif = '' write_motif = False for row in dreme_txt_list: if write_motif and row.startswith('MOTIF '): break if ('MOTIF '+seq+' DREME') in row: write_motif = True if write_motif: dreme_txt_motif += row + '\n' #print(dreme_txt_motif) meme_file.write(dreme_txt_motif + '\n') def regroup_motifs(path_motif, exp_design_name): motif_set = set() dict_dataset = defaultdict(list) dict_evalue = defaultdict(list) for data_name in os.listdir(path_motif + 'logs' + os.sep): if data_name.startswith('Dreme_') and data_name.endswith('.log') and "-" in data_name :# exp_design_name in data_name: type_data = data_name.replace('Dreme_','').replace('.log','') print(type_data, data_name) with open(path_motif + 'logs/' + data_name,'r') as log_file: log_file.readline() for line in log_file: motif = line.split('\t')[1] evalue = line.strip().split('\t')[-1] motif_set.add(motif) print(evalue,motif) dict_dataset[motif].append(type_data) dict_evalue[motif].append(evalue) with open(path_motif + 'Motif_'+exp_design_name+'_Diff_raw.txt','w') as motif_log: motif_log.write("Motif\tData\tP-value\n") for motif in motif_set: motif_log.write(motif+'\t'+';'.join(dict_dataset[motif])+'\t'+';'.join(dict_evalue[motif])+'\n') def select_centrally_enriched(path_analysis, path_motif, exp_design_name, evalue_cutoff): centrimo_motif = dict() for data_name in os.listdir(path_motif + 'logs/'): if data_name.startswith('Dreme_') and data_name.endswith('.log') and "-" in data_name :# exp_design_name in data_name: type_data = data_name.replace('Dreme_','').replace('.log','') centrimo_path = path_analysis + '/results/' + type_data + '/centrimo_out/centrimo.txt' if os.path.exists(centrimo_path): with open(centrimo_path, 'r') as centrimo_file: for line in centrimo_file: if 'DREME' in line: motif = line.split('\t')[1].split(' ')[0] print(motif, type_data) centrimo_motif[motif] = type_data print(len(centrimo_motif)) with open(path_motif + 'Motif_' + exp_design_name + '_Diff_raw.txt', 'r') as motif_log, \ open(path_motif + 'Motif_' + exp_design_name + '_Diff.txt', 'w') as motif_filter_log: motif_filter_log.write("Motif\tData\tP-value\tCentrally Enriched\n") motif_log.readline() for line in motif_log: print(line.split('\t')[0]) if line.split('\t')[0] in centrimo_motif: evalue_list = line.split('\t')[2].split(';') min_evalue = 1 for evalue in evalue_list: if float(evalue)<min_evalue: min_evalue = float(evalue) if min_evalue < evalue_cutoff: motif_filter_log.write(line.strip()+'\tYes\n') def create_motif(path_motif, motif): motif_file = path_motif + '/motif/' + motif + '.meme' motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.eps' # motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.png' # Meme-suite as to be installed for using = iupac2meme iupac2_command = 'iupac2meme -dna ' + motif + ' > ' + motif_file print(iupac2_command) os.system(iupac2_command) ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o ' + motif_figure_file + ' -f eps' # ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o '+motif_figure_file+' -f png' print('Run ' + ceqlogo_command) os.system(ceqlogo_command) def regroup_figures(path_motif, motif_list): ''' Create SVG file with all motifs included :param path_motif: :param motif_list: :return: ''' print('Create PNG file with all motifs') df_summary = pd.read_csv(path_motif + motif_list + '.txt', sep='\t') #df_summary = df_summary.sort_values(by=['Motif'], ascending=[True]) svg_text = '<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!-- Generator: Adobe Illustrator 15.0.0, SVG Export ' \ 'Plug-In . SVG Version: 6.00 Build 0) -->\n<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"' \ 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n<svg version=\"1.1\" id=\"Calque_1\" ' \ 'xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" x=\"0px\" y=\"0px\"' \ ' width=\"1300px\" height=\"'+str(len(df_summary.index)*100)+'px\" viewBox=\"0 0 1300 '\ +str(len(df_summary.index)*100)+'\" xml:space=\"preserve\">\n' i=0 for index, row in df_summary.iterrows(): motif = row['Motif'] data = row['Data'] pvalue = row['P-value'] #comment = row['Comment'] image_file = 'motif_figure/' + motif + '_nc.png' new_row = '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"'+image_file+'\" ' \ 'transform=\"matrix(0.5184 0 0 0.5184 10.00 '+ str(float(100*i))+')\">\n</image>\n' image_file = 'motif_figure/' + motif + '_rc.png' new_row += '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"' + image_file + '\" ' \ 'transform=\"matrix(0.5184 0 0 0.5184 170.00 '+ str(float(100*i))+')\">\n</image>\n' new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 340 '+ str(float(50+100*i))+')\" font-family=' \ '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+data+'</text>\n' new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 800 '+ str(float(50+100*i))+')\" font-family=' \ '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+pvalue+'</text>\n' #new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 1000 ' + str(float(50 + 100 * i)) + ')\" font-family=' \ # '\"\'MyriadPro-Regular\'\" font-size=\"40\">' + comment + '</text>\n' svg_text += new_row i+=1 svg_text += '</svg>\n' with open(path_motif + motif_list + '.svg', 'w') as figure_file: figure_file.write(svg_text) print('Convert svg to png') os.system('convert '+path_motif + motif_list + '.svg '+path_motif + motif_list + '.png') def run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file): ''' Creates a fimo.sh file with all the command to run :param path_analysis: path where the fasta files, background, and fimo results should be :param exp_design_name: name of the peaks : example: CecAm_Raw_1 :param path_motif: path where the motif will be found :param motif_list_filename: path of the motif list :return: save file to path_motif + 'Fimo.sh' ''' motif_list = list() with open(path_motif_list + '.txt', 'r') as motif_file: motif_file.readline() for row in motif_file: motif_list.append(row.split('\t')[0].strip()) data_name = exp_design_name with open(fimo_file, 'w') as motif_sh: for motif in motif_list: motif_file = path_motif + 'motif/' + motif + '.meme' print(motif_file) fasta_file = path_analysis + 'fasta/' + data_name + '.fasta' background_file = path_analysis + 'results/' + data_name + '/background' folder_fimo = path_motif + 'fimo/' + motif + '_' + data_name fimo_sh = 'fimo --parse-genomic-coord --verbosity 1 --thresh 1e-2 --max-stored-scores 1000000 ' \ '--oc ' + folder_fimo + ' --bgfile ' + background_file + ' ' + motif_file + ' ' + fasta_file print(fimo_sh) motif_sh.write(fimo_sh+'\n') def run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename): ''' Creates a centrimo.sh file with all the command to run :param path_analysis: path where the fasta files, background, and fimo results should be :param exp_design_name: name of the peaks : example: CecAm_Raw_1 :param path_motif: path where the motif will be found :param motif_list_filename: path of the motif list :return: save file to path_motif + 'Centrimo.sh' ''' motif_list = list() with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file: motif_file.readline() for row in motif_file: motif_list.append(row.split('\t')[0].strip()) data_name = exp_design_name with open(path_motif + 'Centrimo.sh', 'w') as motif_sh: for motif in motif_list: motif_file = path_analysis + 'motif/' + motif + '.meme' #print(motif_file) fasta_file = path_analysis + 'fasta/' + data_name + '.fasta' background_file = path_analysis + 'results/' + data_name + '/background' folder_centrimo = path_analysis + 'centrimo/' + motif + '_' + data_name centrimo_sh = 'centrimo -seqlen 150 -verbosity 1 -oc ' + folder_centrimo +\ ' --bgfile ' + background_file + ' -score 5.0 '\ '-ethresh 10.0 ' + fasta_file + ' '+motif_file print(centrimo_sh) motif_sh.write(centrimo_sh + '\n') def create_motif_table(exp_design_name, path_motif, path_motif_list): ''' Creates two tables summarizing occurence of motif for each peak :param path_analysis: path where the fasta files, background, and fimo results should be :param exp_design_name: name of the peaks : example: CecAm_Raw_1 :param path_motif: path where the motif will be found :param motif_list_filename: path of the motif list :return: save file to path_analysis + motif_list_filename + '_score.txt' and path_analysis + motif_list_filename + '_count.txt' ''' motif_list = list() with open(path_motif_list + '.txt', 'r') as motif_file: motif_file.readline() for row in motif_file: motif_list.append(row.split('\t')[0].strip()) data_name = exp_design_name fasta_to_length = dict() print(path_analysis + 'fasta/' + exp_design_name + '.fasta' ) fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis + 'fasta/' + exp_design_name + '.fasta' , "fasta")) nb_peaks = len(fasta_seq.keys()) np_motif = np.zeros((nb_peaks, len(motif_list))) df_motif_score = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list) np_motif = np.zeros((nb_peaks, len(motif_list))) df_motif_count = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list) np_motif = np.ones((nb_peaks, len(motif_list))) df_motif_pvalue = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list) for motif in motif_list: print(motif) df_motif_peaks = pd.read_csv(path_motif + 'fimo/' + motif + '_' + data_name + '/fimo.txt', index_col=0, sep='\t') grouped = df_motif_peaks[['sequence name','score']].groupby('sequence name') score = grouped.sum() df_motif_score[motif] = score counted = grouped.count() df_motif_count[motif] = counted # df_motif_pvalue[motif][peak] += pvalue df_motif_score.to_csv(path_motif_list + '_score.txt', sep='\t') df_motif_count.to_csv(path_motif_list + '_count.txt', sep='\t') #df_motif_pvalue.to_csv(path_motif + exp_design_name + '_' + motif_list_filename + '_pvalue.txt', sep='\t') def get_size_fasta(path_analysis, exp_design_name): with open(path_analysis + 'Fasta_Summary.txt', 'w') as fasta_summary: for data_name in os.listdir(path_analysis+'fasta/'): if data_name.endswith('.fasta'): print(data_name) fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis+'fasta/'+data_name, "fasta")) print(data_name,len(fasta_seq)) fasta_summary.write(data_name.replace('.fasta','')+'\t'+str(len(fasta_seq))+'\n') def motif_vs_fasta(path_analysis, exp_design_name, path_motif, motif_list_filename): ''' Creates two tables summarizing occurence of motif for each fasta file :param path_analysis: path where the fasta files, background, and fimo results should be :param exp_design_name: name of the peaks : example: CecAm_Raw_1 :param path_motif: path where the motif will be found :param motif_list_filename: path of the motif list :return: save file to path_analysis + motif_list_filename + '_score.txt' and path_analysis + motif_list_filename + '_count.txt' ''' # read fasta summary fasta_to_size = dict() with open(path_analysis + 'Fasta_Summary.txt', 'r') as fasta_summary: for row in fasta_summary: fasta_name = row.split('\t')[0] fasta_size = row.split('\t')[1].strip() if fasta_size != '0': fasta_to_size[fasta_name] = fasta_size # read list motif_list = list() with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file: motif_file.readline() for row in motif_file: motif_list.append(row.split('\t')[0].strip()) # read motif summary df_motif_score = pd.read_csv(path_analysis + motif_list_filename + '_score.txt', index_col=0, sep='\t') df_motif_count = pd.read_csv(path_analysis + motif_list_filename + '_count.txt', index_col=0, sep='\t') # create motif vs fasta tables nb_fasta = len(fasta_to_size.keys()) np_fasta = np.zeros((len(motif_list), nb_fasta)) df_motif_fasta_score = pd.DataFrame(np_fasta, index=motif_list, columns=fasta_to_size.keys()) np_fasta = np.zeros((len(motif_list), nb_fasta)) df_motif_fasta_count = pd.DataFrame(np_fasta, index=motif_list, columns=fasta_to_size.keys()) print(motif_list) # go through all fasta file for fasta_name, fasta_size in fasta_to_size.items(): fasta_peaks = SeqIO.to_dict(SeqIO.parse(path_analysis+'fasta/'+fasta_name+'.fasta', 'fasta')) #print(fasta_peaks.keys()) for motif in motif_list: motif_score = df_motif_score.loc[fasta_peaks.keys(), motif].sum() df_motif_fasta_score[fasta_name][motif] = float(motif_score)/float(fasta_size) motif_count = df_motif_count.loc[fasta_peaks.keys(), motif].sum() df_motif_fasta_count[fasta_name][motif] = float(motif_count)/float(fasta_size) df_motif_fasta_score.to_csv(path_analysis + motif_list_filename + '_fasta_score.txt', sep='\t') df_motif_fasta_count.to_csv(path_analysis + motif_list_filename + '_fasta_count.txt', sep='\t') #exp_design_name = 'LivOld' #exp_design_name = 'LiverZT_MaxValues' #exp_design_name = 'Liver_all_MaxMaxValues_1' #exp_design_name = 'Liver_all_MaxMaxValues_3' exp_design_name = 'Cecum_all_withoutabxGF_MaxMaxValues_3' #exp_design_name = 'Cecum_all_MaxMaxValues_3' #exp_design_name = 'LivOld_MaxValues' #exp_design_name = 'CecAm_MaxValues' pathTars = '/pasteur/projets/policy01/m6aAkker/' #path = '/Volumes/m6aAkker/' path = 'Z:' + os.sep path_analysis = path + 'PeakDiffExpression' + os.sep + exp_design_name + os.sep + 'Motif' + os.sep #path_motif = path_analysis path_motif = path + 'PeakDiffExpression' + os.sep + 'Motif' + os.sep #motif_list_filename = 'Motif_'+exp_design_name+'_count' motif_list_filename = 'Motif_' + exp_design_name path_motif_list = path_analysis + motif_list_filename # Create all folders for motif analysis # if not os.path.isdir(path_motif + 'logs' + os.sep): # os.mkdir(path_motif + 'logs' + os.sep) # if not os.path.isdir(path_motif + 'motif' + os.sep): # os.mkdir(path_motif + 'motif' + os.sep) # if not os.path.isdir(path_motif + 'motif_figure' + os.sep): # os.mkdir(path_motif + 'motif_figure' + os.sep) # if not os.path.isdir(path_motif + 'fimo' + os.sep): # os.mkdir(path_motif + 'fimo' + os.sep) # if not os.path.isdir(path_motif + 'centrimo' + os.sep): # os.mkdir(path_motif + 'centrimo' + os.sep) # first look at log to see which software crashed #parse_all_logs(path_analysis, exp_design_name) # Parse meme results and extract all motifs #parse_dreme(path_analysis, path_motif, exp_design_name) # regroup all motifs #regroup_motifs(path_motif, exp_design_name) # Select motif centrally enriched -> Could be done automatically #evalue_cutoff = 0.00001 #evalue_cutoff = 0.01 #select_centrally_enriched(path_analysis, path_motif, exp_design_name, evalue_cutoff) # Manually filter motif list and put it in path_motif # add new motif #motif = 'RRACH' #motif="NGGACN" #create_motif(path_motif, motif) #motif = 'NBCAN' #add_motif(path_motif, motif, motif_list_filename) # put all motifs figures together to filter the list # regroup_figures(path_motif, motif_list_filename) path_analysis = pathTars + 'PeakDiffExpression/' + exp_design_name + '/Motif/' #path_motif = path_analysis path_motif = pathTars + 'PeakDiffExpression/Motif/' #motif_list_filename = 'Motif_'+exp_design_name+'_count' motif_list_filename = 'Motif_' + exp_design_name + "_Meth" path_motif_list = path + 'PeakDiffExpression' + os.sep + 'Motif' + os.sep + motif_list_filename fimo_file = path + 'PeakDiffExpression'+os.sep+'Motif' + os.sep + 'Fimo_' + exp_design_name + "_Meth"+ '.sh' # write sh file for running fimo run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file) # regroup everything in one .sh file #print('cat '+path_motif+ '/Fimo_* > ' + path_motif + '/Fimo.sh') # write sh file for running centrimo #run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename) # # exp_design_name = 'Liver_all_MaxMaxValues_1' # path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/' # path_motif = path + 'PeakDiffExpression/Motif/' # motif_list_filename = 'Motif_' + exp_design_name # path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename # # # create a table grouping peak marginal score for each motif # create_motif_table(exp_design_name, path_motif, path_motif_list) # # exp_design_name = 'Liver_all_MaxMaxValues_3' # path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/' # path_motif = path + 'PeakDiffExpression/Motif/' # motif_list_filename = 'Motif_' + exp_design_name # path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename # # # create a table grouping peak marginal score for each motif # create_motif_table(exp_design_name, path_motif, path_motif_list) # # exp_design_name = 'Cecum_all_MaxMaxValues_1' # path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/' # path_motif = path + 'PeakDiffExpression/Motif/' # motif_list_filename = 'Motif_' + exp_design_name # path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename # # # create a table grouping peak marginal score for each motif # create_motif_table(exp_design_name, path_motif, path_motif_list) # get size fasta #get_size_fasta(path_analysis, exp_design_name) # Count marginal score and marginal motif presence for every fasta file #motif_vs_fasta(path_analysis, exp_design_name, path_motif, motif_list_filename) # Using R run motif analysis for clustering of motif and biocondition # use clustered list of motif to recreate figure #regroup_figures(path_motif, motif_list_filename + '_' + exp_design_name)