#!/usr/bin/env python
from collections import defaultdict
import os
import os.path
from shutil import copyfile
import numpy as np
import pandas as pd
from Bio import SeqIO
SOFTWARES = {'meme','dreme','centrimo','meme_tomtom'}
##TYPES_SEARCHES = {'All','Narrow','Const','Max'}
def parse_all_logs(path_analysis, exp_design_name):
    with open(path_analysis + 'Motif_search.log', 'w') as general_log:
        header = 'Data_name\t' + '\t'.join(SOFTWARES)
        general_log.write(header + '\n')
        for data_name in os.listdir(path_analysis + 'results' + os.sep):
            if not data_name.startswith('.') and not data_name.endswith('.log') and not data_name.endswith('.sh'):
                progress_filename = path_analysis + 'results' + os.sep + data_name + os.sep + 'progress_log.txt'
                line = data_name + '\t'
                for software in SOFTWARES:
                    found = False
                    with open(progress_filename, 'r') as file:
                        for row in file:
                            status_soft = 'name: ' + software + ' status: 0'
                            if status_soft in row:
                                found = True
                    if found:
                        line += '1\t'
                    else:
                        line += '0\t'
                general_log.write(line.strip() + '\n')
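# A hypothetical excerpt of the 'progress_log.txt' that parse_all_logs scans;
# the layout is assumed from the substring matched above, with 'status: 0'
# marking a tool that exited cleanly:
#   name: meme status: 0
#   name: dreme status: 0
#   name: centrimo status: 1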
def parse_dreme(path_analysis, path_motif, exp_design_name):
    with open(path_analysis + 'dreme.sh', 'w') as dreme_file:
        for data_name in os.listdir(path_analysis + 'results' + os.sep):
            if exp_design_name in data_name:
                print(data_name)
                if not data_name.startswith('.') and not data_name.endswith('.log') and not data_name.endswith('.sh'):
                    dreme_xml = path_analysis + 'results' + os.sep + data_name + os.sep + 'dreme_out' + \
                        os.sep + 'dreme.xml'
                    if os.path.exists(dreme_xml):
                        dreme_list = list()
                        with open(dreme_xml, 'r') as file:
                            for row in file:
                                dreme_list.append(row.strip())
                                if '<command_line>' in row:
                                    dreme_file.write(row)
                        dreme_txt = path_analysis + 'results' + os.sep + data_name + os.sep + 'dreme_out' \
                            + os.sep + 'dreme.txt'
                        dreme_txt_list = list()
                        with open(dreme_txt, 'r') as file:
                            for row in file:
                                dreme_txt_list.append(row.strip())
                        # keep the header of dreme.txt (from 'MEME version' up to the first motif)
                        dreme_txt_intro = ''
                        write_intro = False
                        for row in dreme_txt_list:
                            if row.startswith('MOTIF '):
                                break
                            if 'MEME version' in row:
                                write_intro = True
                            if write_intro:
                                dreme_txt_intro += row + '\n'
                        # parse dreme
                        with open(path_motif + os.sep + 'logs' + os.sep + 'Dreme_' + data_name + '.log', 'w') as general_log:
                            header = 'ID\tMotif\tNb_sites\tPos_Occurence\tNeg_Occurence\tNb_Seq\t' \
                                     'Percent\tPos_Percent\tNeg_Percent\tEvalue'
                            general_log.write(header + '\n')
                            # search length fasta (number of positive sequences)
                            number_seq = 0.
                            for line in dreme_list:
                                if '<positives name=' in line:
                                    number_seq = float(line.split(' ')[2].replace('"', '').replace('count=', ''))
                            # search the start of the <motifs> block
                            index_motifs = len(dreme_list)
                            for i in range(1, len(dreme_list)):
                                if '<motifs>' in dreme_list[i]:
                                    index_motifs = i
                                    break
                            # get motifs info
                            for k in range(index_motifs + 1, len(dreme_list)):
                                if '<motif' in dreme_list[k]:
                                    new_line = dreme_list[k].replace('"', '').split(' ')
                                    id = new_line[1].replace('id=', '')
                                    seq = new_line[2].replace('seq=', '')
                                    occ = float(new_line[4].replace('nsites=', ''))
                                    pos_occ = float(new_line[5].replace('p=', ''))
                                    neg_occ = float(new_line[6].replace('n=', ''))
                                    evalue = new_line[8].replace('evalue=', '')
                                    # save table
                                    line_to_write = [id, seq, str(occ), str(pos_occ), str(neg_occ), str(number_seq),
                                                     str(occ / number_seq), str(pos_occ / number_seq),
                                                     str(neg_occ / number_seq), str(evalue)]
                                    general_log.write('\t'.join(line_to_write) + '\n')
                                    # copy png files: normal-complement logos to _nc.png,
                                    # everything else (reverse-complement) to _rc.png
                                    dreme_out = path_analysis + 'results' + os.sep + data_name + os.sep + 'dreme_out' + os.sep
                                    for imagefile in os.listdir(dreme_out):
                                        if id in imagefile:
                                            src = dreme_out + imagefile
                                            if 'nc_' in imagefile:
                                                dst = path_motif + os.sep + 'motif_figure' + os.sep + seq + '_nc.png'
                                            else:
                                                dst = path_motif + os.sep + 'motif_figure' + os.sep + seq + '_rc.png'
                                            copyfile(src, dst)
                                    # save motif in minimal MEME format
                                    with open(path_motif + os.sep + 'motif' + os.sep + seq + '.meme', 'w') as meme_file:
                                        meme_file.write(dreme_txt_intro + '\n')
                                        # find motif info in dreme.txt
                                        dreme_txt_motif = ''
                                        write_motif = False
                                        for row in dreme_txt_list:
                                            if write_motif and row.startswith('MOTIF '):
                                                break
                                            if ('MOTIF ' + seq + ' DREME') in row:
                                                write_motif = True
                                            if write_motif:
                                                dreme_txt_motif += row + '\n'
                                        meme_file.write(dreme_txt_motif + '\n')
def regroup_motifs(path_motif, exp_design_name):
    '''
    Groups every motif found in the Dreme logs with the datasets and e-values where it occurs
    '''
    motif_set = set()
    dict_dataset = defaultdict(list)
    dict_evalue = defaultdict(list)
    for data_name in os.listdir(path_motif + 'logs' + os.sep):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and '-' in data_name:  # exp_design_name in data_name:
            type_data = data_name.replace('Dreme_', '').replace('.log', '')
            print(type_data, data_name)
            with open(path_motif + 'logs/' + data_name, 'r') as log_file:
                log_file.readline()
                for line in log_file:
                    motif = line.split('\t')[1]
                    evalue = line.strip().split('\t')[-1]
                    motif_set.add(motif)
                    dict_dataset[motif].append(type_data)
                    dict_evalue[motif].append(evalue)
    with open(path_motif + 'Motif_' + exp_design_name + '_Diff_raw.txt', 'w') as motif_log:
        motif_log.write('Motif\tData\tP-value\n')
        for motif in motif_set:
            motif_log.write(motif + '\t' + ';'.join(dict_dataset[motif]) + '\t' + ';'.join(dict_evalue[motif]) + '\n')
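# Hypothetical rows of the resulting Motif_<exp_design_name>_Diff_raw.txt, with
# datasets and e-values ';'-joined in matching order (values invented for illustration):
#   Motif   Data                       P-value
#   ACGTW   CecAm-Raw-1;CecAm-Raw-2    1.1e-8;3.2e-4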
def select_centrally_enriched(path_analysis, path_motif, exp_design_name, evalue_cutoff):
    centrimo_motif = dict()
    for data_name in os.listdir(path_motif + 'logs/'):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and '-' in data_name:  # exp_design_name in data_name:
            type_data = data_name.replace('Dreme_', '').replace('.log', '')
            centrimo_path = path_analysis + '/results/' + type_data + '/centrimo_out/centrimo.txt'
            if os.path.exists(centrimo_path):
                with open(centrimo_path, 'r') as centrimo_file:
                    for line in centrimo_file:
                        if 'DREME' in line:
                            motif = line.split('\t')[1].split(' ')[0]
                            print(motif, type_data)
                            centrimo_motif[motif] = type_data
    print(len(centrimo_motif))
    with open(path_motif + 'Motif_' + exp_design_name + '_Diff_raw.txt', 'r') as motif_log, \
            open(path_motif + 'Motif_' + exp_design_name + '_Diff.txt', 'w') as motif_filter_log:
        motif_filter_log.write('Motif\tData\tP-value\tCentrally Enriched\n')
        motif_log.readline()
        for line in motif_log:
            motif = line.split('\t')[0]
            print(motif)
            if motif in centrimo_motif:
                # keep the best (smallest) e-value across datasets
                evalue_list = line.split('\t')[2].split(';')
                min_evalue = 1
                for evalue in evalue_list:
                    if float(evalue) < min_evalue:
                        min_evalue = float(evalue)
                if min_evalue < evalue_cutoff:
                    motif_filter_log.write(line.strip() + '\tYes\n')
                    motif_file = path_motif + '/motif/' + motif + '.meme'
                    motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.eps'
                    # motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.png'
                    # The MEME suite has to be installed to use iupac2meme
                    iupac2_command = 'iupac2meme -dna ' + motif + ' > ' + motif_file
                    print(iupac2_command)
                    os.system(iupac2_command)
                    ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o ' + motif_figure_file + ' -f eps'
                    # ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o ' + motif_figure_file + ' -f png'
                    print('Run ' + ceqlogo_command)
                    os.system(ceqlogo_command)
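# Example of the shell commands emitted for a hypothetical motif 'ACGTW':
# iupac2meme converts an IUPAC string to minimal MEME format, ceqlogo draws its logo.
#   iupac2meme -dna ACGTW > <path_motif>/motif/ACGTW.meme
#   ceqlogo -i <path_motif>/motif/ACGTW.meme -m ACGTW -o <path_motif>/motif_figure/ACGTW_nc.eps -f eps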
def regroup_figures(path_motif, motif_list):
    '''
    Create an SVG file with all motifs included, then convert it to PNG
    :param path_motif: path where the motif figures are found
    :param motif_list: basename of the motif list file
    :return:
    '''
    print('Create PNG file with all motifs')
    df_summary = pd.read_csv(path_motif + motif_list + '.txt', sep='\t')
    # df_summary = df_summary.sort_values(by=['Motif'], ascending=[True])
    svg_text = '<?xml version="1.0" encoding="utf-8"?>\n<!-- Generator: Adobe Illustrator 15.0.0, SVG Export ' \
               'Plug-In . SVG Version: 6.00 Build 0) -->\n<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "' \
               'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n<svg version="1.1" id="Calque_1" ' \
               'xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"' \
               ' width="1300px" height="' + str(len(df_summary.index) * 100) + 'px" viewBox="0 0 1300 ' \
               + str(len(df_summary.index) * 100) + '" xml:space="preserve">\n'
    i = 0
    for index, row in df_summary.iterrows():
        motif = row['Motif']
        data = row['Data']
        pvalue = str(row['P-value'])
        # one row per motif: normal- and reverse-complement logos, dataset names, e-values
        image_file = 'motif_figure/' + motif + '_nc.png'
        new_row = '<image overflow="visible" width="299" height="176" xlink:href="' + image_file + '" ' \
                  'transform="matrix(0.5184 0 0 0.5184 10.00 ' + str(float(100 * i)) + ')">\n</image>\n'
        image_file = 'motif_figure/' + motif + '_rc.png'
        new_row += '<image overflow="visible" width="299" height="176" xlink:href="' + image_file + '" ' \
                   'transform="matrix(0.5184 0 0 0.5184 170.00 ' + str(float(100 * i)) + ')">\n</image>\n'
        new_row += '<text transform="matrix(0.5184 0 0 0.5184 340 ' + str(float(50 + 100 * i)) + ')" font-family=' \
                   '"\'MyriadPro-Regular\'" font-size="40">' + data + '</text>\n'
        new_row += '<text transform="matrix(0.5184 0 0 0.5184 800 ' + str(float(50 + 100 * i)) + ')" font-family=' \
                   '"\'MyriadPro-Regular\'" font-size="40">' + pvalue + '</text>\n'
        svg_text += new_row
        i += 1
    svg_text += '</svg>\n'
    with open(path_motif + motif_list + '.svg', 'w') as figure_file:
        figure_file.write(svg_text)
    print('Convert svg to png')
    os.system('convert ' + path_motif + motif_list + '.svg ' + path_motif + motif_list + '.png')
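# 'convert' assumes ImageMagick is on the PATH; a minimal alternative sketch,
# assuming the cairosvg package is installed instead:
#   import cairosvg
#   cairosvg.svg2png(url=path_motif + motif_list + '.svg',
#                    write_to=path_motif + motif_list + '.png')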
def run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file):
    '''
    Creates a fimo.sh file with all the commands to run
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param path_motif_list: path of the motif list
    :param fimo_file: path of the .sh file to write
    :return: save file to fimo_file
    '''
    # the peaks fasta is assumed to be named after the experiment design
    data_name = exp_design_name
    motif_list = list()
    with open(path_motif_list + '.txt', 'r') as motif_file:
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    with open(fimo_file, 'w') as motif_sh:
        for motif in motif_list:
            motif_file = path_motif + 'motif/' + motif + '.meme'
            fasta_file = path_analysis + 'fasta/' + data_name + '.fasta'
            background_file = path_analysis + 'results/' + data_name + '/background'
            folder_fimo = path_motif + 'fimo/' + motif + '_' + data_name
            fimo_sh = 'fimo --parse-genomic-coord --verbosity 1 --thresh 1e-2 --max-stored-scores 1000000 ' \
                      '--oc ' + folder_fimo + ' --bgfile ' + background_file + ' ' + motif_file + ' ' + fasta_file
            print(fimo_sh)
            motif_sh.write(fimo_sh + '\n')
def run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename):
    '''
    Creates a centrimo.sh file with all the commands to run
    :param path_analysis: path where the fasta files, background, and centrimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_motif + 'Centrimo.sh'
    '''
    # the peaks fasta is assumed to be named after the experiment design
    data_name = exp_design_name
    motif_list = list()
    with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file:
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    with open(path_motif + 'Centrimo.sh', 'w') as motif_sh:
        for motif in motif_list:
            motif_file = path_motif + 'motif/' + motif + '.meme'
            fasta_file = path_analysis + 'fasta/' + data_name + '.fasta'
            background_file = path_analysis + 'results/' + data_name + '/background'
            folder_centrimo = path_motif + 'centrimo/' + motif + '_' + data_name
            centrimo_sh = 'centrimo -seqlen 150 -verbosity 1 -oc ' + folder_centrimo + \
                          ' --bgfile ' + background_file + ' -score 5.0 ' \
                          '-ethresh 10.0 ' + fasta_file + ' ' + motif_file
            print(centrimo_sh)
            motif_sh.write(centrimo_sh + '\n')
def create_motif_table(exp_design_name, path_motif, path_motif_list):
    '''
    Creates two tables summarizing occurrence of motifs for each peak
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param path_motif_list: path of the motif list
    :return: save file to path_motif_list + '_score.txt'
             and path_motif_list + '_count.txt'
    Note: path_analysis (where the fasta files are) is taken from module scope.
    '''
    data_name = exp_design_name
    motif_list = list()
    with open(path_motif_list + '.txt', 'r') as motif_file:
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    print(path_analysis + 'fasta/' + exp_design_name + '.fasta')
    fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis + 'fasta/' + exp_design_name + '.fasta', 'fasta'))
    nb_peaks = len(fasta_seq.keys())
    np_motif = np.zeros((nb_peaks, len(motif_list)))
    df_motif_score = pd.DataFrame(np_motif, index=list(fasta_seq.keys()), columns=motif_list)
    np_motif = np.zeros((nb_peaks, len(motif_list)))
    df_motif_count = pd.DataFrame(np_motif, index=list(fasta_seq.keys()), columns=motif_list)
    np_motif = np.ones((nb_peaks, len(motif_list)))
    df_motif_pvalue = pd.DataFrame(np_motif, index=list(fasta_seq.keys()), columns=motif_list)
    for motif in motif_list:
        print(motif)
        # fimo output file name assumed: older MEME releases write 'fimo.txt',
        # newer ones 'fimo.tsv' (with 'sequence_name' instead of 'sequence name')
        df_motif_peaks = pd.read_csv(path_motif + 'fimo/' + motif + '_' + data_name +
                                     '/fimo.txt', sep='\t')
        grouped = df_motif_peaks[['sequence name', 'score']].groupby('sequence name')
        # marginal score and motif count per peak; peaks without a hit stay at 0
        score = grouped.sum()
        df_motif_score[motif] = score['score'].reindex(df_motif_score.index).fillna(0)
        counted = grouped.count()
        df_motif_count[motif] = counted['score'].reindex(df_motif_count.index).fillna(0)
    df_motif_score.to_csv(path_motif_list + '_score.txt', sep='\t')
    df_motif_count.to_csv(path_motif_list + '_count.txt', sep='\t')
    # df_motif_pvalue.to_csv(path_motif + exp_design_name + '_' + motif_list_filename + '_pvalue.txt', sep='\t')
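# The fimo.txt parsed above is assumed to use the classic tab-separated header of
# older MEME releases; newer releases write fimo.tsv with underscored column
# names ('sequence_name'), which would require adapting the groupby above:
#   #pattern name  sequence name  start  stop  strand  score  p-value  q-value  matched sequence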
def get_size_fasta(path_analysis, exp_design_name):
    with open(path_analysis + 'Fasta_Summary.txt', 'w') as fasta_summary:
        for data_name in os.listdir(path_analysis + 'fasta/'):
            if data_name.endswith('.fasta'):
                fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis + 'fasta/' + data_name, 'fasta'))
                print(data_name, len(fasta_seq))
                fasta_summary.write(data_name.replace('.fasta', '') + '\t' + str(len(fasta_seq)) + '\n')
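# Hypothetical lines of the resulting Fasta_Summary.txt: fasta name without
# extension, then its number of sequences (sizes invented for illustration):
#   CecAm_Raw_1    1523
#   CecAm_Raw_2    987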
def motif_vs_fasta(path_analysis, exp_design_name, path_motif, motif_list_filename):
    '''
    Creates two tables summarizing occurrence of motifs for each fasta file
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_analysis + motif_list_filename + '_fasta_score.txt'
             and path_analysis + motif_list_filename + '_fasta_count.txt'
    '''
    # read fasta summary
    fasta_to_size = dict()
    with open(path_analysis + 'Fasta_Summary.txt', 'r') as fasta_summary:
        for row in fasta_summary:
            fasta_name = row.split('\t')[0]
            fasta_size = row.split('\t')[1].strip()
            if fasta_size != '0':
                fasta_to_size[fasta_name] = fasta_size
    # read list
    motif_list = list()
    with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file:
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    df_motif_score = pd.read_csv(path_analysis + motif_list_filename + '_score.txt', index_col=0, sep='\t')
    df_motif_count = pd.read_csv(path_analysis + motif_list_filename + '_count.txt', index_col=0, sep='\t')
    # create motif vs fasta tables
    nb_fasta = len(fasta_to_size.keys())
    np_fasta = np.zeros((len(motif_list), nb_fasta))
    df_motif_fasta_score = pd.DataFrame(np_fasta, index=motif_list, columns=list(fasta_to_size.keys()))
    np_fasta = np.zeros((len(motif_list), nb_fasta))
    df_motif_fasta_count = pd.DataFrame(np_fasta, index=motif_list, columns=list(fasta_to_size.keys()))
    print(motif_list)
    # go through all fasta files, normalizing by the number of peaks in each
    for fasta_name, fasta_size in fasta_to_size.items():
        fasta_peaks = SeqIO.to_dict(SeqIO.parse(path_analysis + 'fasta/' + fasta_name + '.fasta', 'fasta'))
        for motif in motif_list:
            motif_score = df_motif_score.loc[list(fasta_peaks.keys()), motif].sum()
            df_motif_fasta_score.loc[motif, fasta_name] = float(motif_score) / float(fasta_size)
            motif_count = df_motif_count.loc[list(fasta_peaks.keys()), motif].sum()
            df_motif_fasta_count.loc[motif, fasta_name] = float(motif_count) / float(fasta_size)
    df_motif_fasta_score.to_csv(path_analysis + motif_list_filename + '_fasta_score.txt', sep='\t')
    df_motif_fasta_count.to_csv(path_analysis + motif_list_filename + '_fasta_count.txt', sep='\t')
#exp_design_name = 'LivOld'
exp_design_name = 'Cecum_all_withoutabxGF_MaxMaxValues_3'
#path = '/Volumes/m6aAkker/'
path = 'Z:' + os.sep
path_analysis = path + 'PeakDiffExpression' + os.sep + exp_design_name + os.sep + 'Motif' + os.sep
path_motif = path + 'PeakDiffExpression' + os.sep + 'Motif' + os.sep
#motif_list_filename = 'Motif_'+exp_design_name+'_count'
motif_list_filename = 'Motif_' + exp_design_name
path_motif_list = path_analysis + motif_list_filename
# Create all folders for motif analysis
# if not os.path.isdir(path_motif + 'logs' + os.sep):
# os.mkdir(path_motif + 'logs' + os.sep)
# if not os.path.isdir(path_motif + 'motif' + os.sep):
# os.mkdir(path_motif + 'motif' + os.sep)
# if not os.path.isdir(path_motif + 'motif_figure' + os.sep):
# os.mkdir(path_motif + 'motif_figure' + os.sep)
# if not os.path.isdir(path_motif + 'fimo' + os.sep):
# os.mkdir(path_motif + 'fimo' + os.sep)
# if not os.path.isdir(path_motif + 'centrimo' + os.sep):
# os.mkdir(path_motif + 'centrimo' + os.sep)
# first look at log to see which software crashed
#parse_all_logs(path_analysis, exp_design_name)
# Parse meme results and extract all motifs
#parse_dreme(path_analysis, path_motif, exp_design_name)
#regroup_motifs(path_motif, exp_design_name)
# Select motif centrally enriched -> Could be done automatically
#evalue_cutoff = 0.00001
#evalue_cutoff = 0.01
#select_centrally_enriched(path_analysis, path_motif, exp_design_name, evalue_cutoff)
# Manually filter motif list and put it in path_motif
#motif = 'NBCAN'
#add_motif(path_motif, motif, motif_list_filename)
# put all motifs figures together to filter the list
# regroup_figures(path_motif, motif_list_filename)
path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/'
#path_motif = path_analysis
path_motif = path + 'PeakDiffExpression/Motif/'
#motif_list_filename = 'Motif_'+exp_design_name+'_count'
motif_list_filename = 'Motif_' + exp_design_name + '_Meth'
path_motif_list = path + 'PeakDiffExpression' + os.sep + 'Motif' + os.sep + motif_list_filename
fimo_file = path + 'PeakDiffExpression' + os.sep + 'Motif' + os.sep + 'Fimo_' + exp_design_name + '_Meth' + '.sh'
# write sh file for running fimo
run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file)
# regroup everything in one .sh file
#print('cat '+path_motif+ '/Fimo_* > ' + path_motif + '/Fimo.sh')
# write sh file for running centrimo
#run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename)
#
# exp_design_name = 'Liver_all_MaxMaxValues_1'
# path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/'
# path_motif = path + 'PeakDiffExpression/Motif/'
# motif_list_filename = 'Motif_' + exp_design_name
# path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename
#
# # create a table grouping peak marginal score for each motif
# create_motif_table(exp_design_name, path_motif, path_motif_list)
#
# exp_design_name = 'Liver_all_MaxMaxValues_3'
# path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/'
# path_motif = path + 'PeakDiffExpression/Motif/'
# motif_list_filename = 'Motif_' + exp_design_name
# path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename
#
# # create a table grouping peak marginal score for each motif
# create_motif_table(exp_design_name, path_motif, path_motif_list)
#
# exp_design_name = 'Cecum_all_MaxMaxValues_1'
# path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/'
# path_motif = path + 'PeakDiffExpression/Motif/'
# motif_list_filename = 'Motif_' + exp_design_name
# path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename
#
# # create a table grouping peak marginal score for each motif
# create_motif_table(exp_design_name, path_motif, path_motif_list)
# get size fasta
#get_size_fasta(path_analysis, exp_design_name)
# Count marginal score and marginal motif presence for every fasta file
#motif_vs_fasta(path_analysis, exp_design_name, path_motif, motif_list_filename)
# Using R run motif analysis for clustering of motif and biocondition
# use clustered list of motif to recreate figure
#regroup_figures(path_motif, motif_list_filename + '_' + exp_design_name)
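# A minimal, hypothetical sketch of that R clustering step done in Python instead,
# assuming scipy is installed; it reorders the motif list by hierarchical
# clustering of the per-fasta scores so regroup_figures can redraw them grouped:
#   from scipy.cluster.hierarchy import linkage, leaves_list
#   df = pd.read_csv(path_analysis + motif_list_filename + '_fasta_score.txt', index_col=0, sep='\t')
#   order = leaves_list(linkage(df.values, method='average'))
#   df.iloc[order].to_csv(path_motif + motif_list_filename + '_clustered.txt', sep='\t')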