Skip to content
Snippets Groups Projects
parse_motif_search.py 36 KiB
Newer Older
#!/usr/bin/env python
from collections import defaultdict
import os
from shutil import copyfile
import pandas as pd
import numpy as np
from Bio import SeqIO
import os.path


SOFTWARES = {'meme','dreme','centrimo','meme_tomtom'}
##TYPES_SEARCHES = {'All','Narrow','Const','Max'}


def parse_all_logs(path_analysis, exp_design_name):
    with open(path_analysis + 'Motif_search.log', 'w') as general_log:
        header = 'Data_name\t' + '\t'.join(SOFTWARES)
        general_log.write(header+'\n')
        for data_name in os.listdir(path_analysis + 'results/'):
            if not data_name.startswith('.') and not data_name.endswith('.log') \
                    and not data_name.endswith('.sh'): # and '_MaxMaxValues' in data_name:
                progress_filename = path_analysis + 'results/' + data_name + '/progress_log.txt'
                line = data_name + '\t'
                for software in SOFTWARES:
                    found = False
                    with open(progress_filename,'r') as file:
                        for row in file:
                            status_soft = 'name: '+software+'  status: 0'
                            if status_soft in row:
                                found = True
                    if found:
                        line += '1\t'
                    else:
                        line += '0\t'
                general_log.write(line.strip() + '\n')


christophebecavin's avatar
christophebecavin committed
def parse_dreme(path_analysis, path_motif, exp_design_name):
    with open(path_analysis + 'dreme.sh', 'w') as dreme_file:
        for data_name in os.listdir(path_analysis + 'results/'):
            #if exp_design_name in data_name:
            print(data_name)
            if not data_name.startswith('.') and not data_name.endswith('.log') \
                    and not data_name.endswith('.sh'): # and '_MaxMaxValues' in data_name:
                # print(data_name)
                dreme_xml = path_analysis + 'results/' + data_name + '/dreme_out/dreme.xml'
                if os.path.exists(dreme_xml):
                    dreme_list = list()
                    with open(dreme_xml, 'r') as file:
                        for row in file:
                            dreme_list.append(row.strip())
                            if '<command_line>' in row:
                                dreme_file.write(row)

                    dreme_txt = path_analysis + 'results/' + data_name + '/dreme_out/dreme.txt'
                    dreme_txt_list = list()
                    with open(dreme_txt, 'r') as file:
                        for row in file:
                            dreme_txt_list.append(row.strip())
                    dreme_txt_intro = ''
                    write_intro = False
                    for row in dreme_txt_list:
                        if row.startswith('MOTIF '):
                            break
                        if 'MEME version' in row:
                            write_intro = True
                        if write_intro:
                            dreme_txt_intro += row + '\n'

                    #print(dreme_txt_intro)

                    # parse dreme
                    with open(path_motif + '/logs/Dreme_' + data_name + '.log', 'w') as general_log:
                        header = 'ID\tMotif\tNb_sites\tPos_Occurence\tNeg_Occurence\tNb_Seq\t' \
                                 'Percent\tPos_Percent\tNeg_Percent\tEvalue'
                        general_log.write(header + '\n')
                        # search length fasta
                        for line in dreme_list:
                            if '<positives name=' in line:
                                number_seq = float(line.split(' ')[2].replace('\"','').replace('count=',''))
                        # search motifs
                        for i in range(1, len(dreme_list)):
                            line = dreme_list[i]
                            if '<motifs>' in line:
                                index_motifs = i

                        # get motifs info
                        for k in range(index_motifs+1, len(dreme_list)):
                            if '<motif' in dreme_list[k]:
                                new_line = dreme_list[k].replace('\"','').split(' ')
                                #print(new_line)
                                id = new_line[1].replace('id=', '')
                                #print(id,data_name)
                                print(dreme_txt_intro)
                                seq = new_line[2].replace('seq=', '')
                                occ = float(new_line[4].replace('nsites=', ''))
                                pos_occ = float(new_line[5].replace('p=', ''))
                                neg_occ = float(new_line[6].replace('n=', ''))
                                evalue = new_line[8].replace('evalue=', '')

                                # save table
                                line_to_write = [id, seq, str(occ), str(pos_occ),str(neg_occ), str(number_seq),
                                                 str(occ/number_seq), str(pos_occ/number_seq), str(neg_occ/number_seq),
                                                 str(evalue)]
                                general_log.write('\t'.join(line_to_write) + '\n')

                                # copy png file
                                for imagefile in os.listdir(path_analysis + 'results/' + data_name + '/dreme_out/'):
                                    if id in imagefile:
                                        if 'nc_' in imagefile:
                                            src = path_analysis + 'results/' + data_name + '/dreme_out/' + imagefile
                                            dst = path_motif + '/motif_figure/'+seq + '_nc.png'
                                            copyfile(src, dst)
                                        else:
                                            src = path_analysis + 'results/' + data_name + '/dreme_out/' + imagefile
                                            dst = path_motif + '/motif_figure/' + seq + '_rc.png'
                                            copyfile(src, dst)

                                # save motif
                                with open(path_motif + '/motif/' + seq + '.meme', 'w') as meme_file:
                                    meme_file.write(dreme_txt_intro+'\n')
                                    # find motif info
                                    dreme_txt_motif = ''
                                    write_motif = False
                                    for row in dreme_txt_list:
                                        if write_motif and row.startswith('MOTIF '):
                                            break
                                        if ('MOTIF '+seq+' DREME') in row:
                                            write_motif = True
                                        if write_motif:
                                            dreme_txt_motif += row + '\n'

                                    print(dreme_txt_motif)
                                    meme_file.write(dreme_txt_motif + '\n')


def regroup_motifs(path_motif, exp_design_name, pvalue_cutoff):
    motif_set = set()
    dict_dataset = defaultdict(list)
    dict_evalue = defaultdict(list)
    #data_selection = 'NoabxGF.'
    for data_name in os.listdir(path_motif + 'logs/'):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and exp_design_name in data_name:
            type_data = data_name.replace('Dreme_','').replace('.log','')
            if 'Meth' in type_data:
                #if data_selection in data_name:
                print(type_data, data_name)
                with open(path_motif + 'logs/' + data_name,'r') as log_file:
                    log_file.readline()
                    for line in log_file:
                        motif = line.split('\t')[1]
                        evalue = line.strip().split('\t')[-1]
                        if float(evalue) < pvalue_cutoff:
                            motif_set.add(motif)
                            print(evalue,motif)
                            dict_dataset[motif].append(type_data)
                            dict_evalue[motif].append(evalue)
            else:
                print(type_data, data_name)
                with open(path_motif + 'logs/' + data_name, 'r') as log_file:
                    log_file.readline()
                    for line in log_file:
                        motif = line.split('\t')[1]
                        evalue = line.strip().split('\t')[-1]
                        if float(evalue) < pvalue_cutoff:
                            motif_set.add(motif)
                            print(evalue, motif)
                            dict_dataset[motif].append(type_data)
                            dict_evalue[motif].append(evalue)


    with open(path_motif + 'Motif_'+exp_design_name+ '__raw.txt','w') as motif_log:
         motif_log.write("Motif\tData\tP-value\n")
         for motif in motif_set:
             print(motif+'\t'+';'.join(dict_dataset[motif]))
             print(motif+'\t'+';'.join(dict_dataset[motif])+'\t'+';'.join(dict_evalue[motif])+'\n')
             motif_log.write(motif+'\t'+';'.join(dict_dataset[motif])+'\t'+';'.join(dict_evalue[motif])+'\n')


def regroup_motifs_diff(path_motif, exp_design_name, pvalue_cutoff):
    '''
    Regroup all motifs coming from comparisons of biological conditions
    :param path_motif:
    :param exp_design_name:
    :param pvalue_cutoff:
    :return:
    '''
    motif_set = set()
    dict_dataset = defaultdict(list)
    dict_evalue = defaultdict(list)
    for data_name in os.listdir(path_motif + 'logs/'):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and '_MaxMaxValues' not in data_name:
            type_data = data_name.replace('Dreme_','').replace('.log','')
christophebecavin's avatar
christophebecavin committed
            print(type_data, data_name)
            with open(path_motif + 'logs/' + data_name,'r') as log_file:
                log_file.readline()
                for line in log_file:
                    motif = line.split('\t')[1]
                    evalue = line.strip().split('\t')[-1]
                    if float(evalue) < pvalue_cutoff:
                        motif_set.add(motif)
                        print(evalue,motif)
                        dict_dataset[motif].append(type_data)
                        dict_evalue[motif].append(evalue)
    with open(path_motif + 'Motif_'+exp_design_name+'_raw.txt','w') as motif_log:
        motif_log.write("Motif\tData\tP-value\n")
        for motif in motif_set:
            motif_log.write(motif+'\t'+';'.join(dict_dataset[motif])+'\t'+';'.join(dict_evalue[motif])+'\n')


def select_centrally_enriched_diff(path_analysis, path_motif, exp_design_name):
    '''
    Add a column indicating if motif are centrally enriched, and select only motif coming form a comparisons.
    Meaning : exp_design_name not in D
    :param path_analysis:
    :param path_motif:
    :param exp_design_name:
    :return:
    '''
christophebecavin's avatar
christophebecavin committed
    centrimo_motif = dict()
    for data_name in os.listdir(path_motif + 'logs/'):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and exp_design_name in data_name:
christophebecavin's avatar
christophebecavin committed
            type_data = data_name.replace('Dreme_','').replace('.log','')
            centrimo_path = path_analysis + '/results/' + type_data + '/centrimo_out/centrimo.txt'
            if os.path.exists(centrimo_path):
                with open(centrimo_path, 'r') as centrimo_file:
                    for line in centrimo_file:
                        if 'DREME' in line:
                            motif = line.split('\t')[1].split(' ')[0]
                            print(motif, type_data)
                            centrimo_motif[motif] = type_data

    print(len(centrimo_motif))

    with open(path_motif + 'Motif_' + exp_design_name + '_raw.txt', 'r') as motif_log, \
becavin christophe's avatar
becavin christophe committed
            open(path_motif + 'Motif_' + exp_design_name + '_Diff.txt', 'w') as motif_filter_log:
        motif_filter_log.write("Motif\tData\tP-value\tCentrallyEnriched\n")
christophebecavin's avatar
christophebecavin committed
        motif_log.readline()
        for line in motif_log:
            print(line.split('\t')[0])
            found = False
            datas = line.split('\t')[1].split(';')
            for data in datas:
                print(data)
christophebecavin's avatar
christophebecavin committed
            if line.split('\t')[0] in centrimo_motif:
                motif_filter_log.write(line.strip()+'\tYes\n')
            else:
                motif_filter_log.write(line.strip() + '\tNo\n')


def select_centrally_enriched(path_analysis, path_motif, exp_design_name):
    #data_selection = 'NoLP.'

    centrimo_motif = dict()
    for data_name in os.listdir(path_motif + 'logs/'):
        if data_name.startswith('Dreme_') and data_name.endswith('.log') and exp_design_name in data_name:
            type_data = data_name.replace('Dreme_','').replace('.log','')
            centrimo_path = path_analysis + '/results/' + type_data + '/centrimo_out/centrimo.txt'
            if os.path.exists(centrimo_path):
                with open(centrimo_path, 'r') as centrimo_file:
                    for line in centrimo_file:
                        if 'DREME' in line:
                            motif = line.split('\t')[1].split(' ')[0]
                            print(motif, type_data)
                            centrimo_motif[motif] = type_data

    print(len(centrimo_motif))

    with open(path_motif + 'Motif_' + exp_design_name + '_raw.txt', 'r') as motif_log, \
            open(path_motif + 'Motif_' + exp_design_name + '.txt', 'w') as motif_filter_log:
        motif_filter_log.write("Motif\tData\tP-value\tCentrallyEnriched\n")
        motif_log.readline()
        for line in motif_log:
            print(line.split('\t')[0])
            if line.split('\t')[0] in centrimo_motif:
                motif_filter_log.write(line.strip()+'\tYes\n')
def create_motif(path_motif, motif):
    motif_file = path_motif + '/motif/' + motif + '.meme'
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
    motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.eps'
#    motif_figure_file = path_motif + '/motif_figure/' + motif + '_nc.png'
    # Meme-suite as to be installed for using = iupac2meme
    iupac2_command = 'iupac2meme -dna ' + motif + ' > ' + motif_file
    print(iupac2_command)
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
    os.system(iupac2_command)
    ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o ' + motif_figure_file + ' -f eps'
#    ceqlogo_command = 'ceqlogo -i ' + motif_file + ' -m ' + motif + ' -o '+motif_figure_file+' -f png'
    print('Run ' + ceqlogo_command)
    os.system(ceqlogo_command)

def regroup_figures(path_motif, motif_list):
    '''
    Create SVG file with all motifs included
    :param path_motif:
    :param motif_list:
    :return:
    '''
    print('Create PNG file with all motifs')
    df_summary = pd.read_csv(path_motif + motif_list + '.txt', sep='\t')
    # Need to split in different part of 20 motifs
    print('Split motif list in figures of 20 moifs each')
    list_df = list()
    k = 0
    # begin = 0
    # for i in range(0, df_summary.shape[0]):
    #     k += 1
    #     if k % 20 == 0:
    #         df_temp = df_summary.iloc[begin:k]
    #         list_df.append(pd.DataFrame(df_temp))
    #         begin = k
    # df_temp = df_summary.iloc[begin:k]
    # list_df.append(df_temp)


    #df_summary = df_summary.sort_values(by=['Motif'], ascending=[True])
    # for k in range(0, len(list_df)):
    #     print(k)
    #     df_summary = list_df[k]

    svg_text = '<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!-- Generator: Adobe Illustrator 15.0.0, SVG Export ' \
               'Plug-In . SVG Version: 6.00 Build 0)  -->\n<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"' \
               'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n<svg version=\"1.1\" id=\"Calque_1\" ' \
               'xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" x=\"0px\" y=\"0px\"' \
               ' width=\"1600px\" height=\"'+str(len(df_summary.index)*100)+'px\" viewBox=\"0 0 1600 ' \
               +str(len(df_summary.index)*100)+'\" xml:space=\"preserve\">\n'
    i=0
    for index, row in df_summary.iterrows():
        motif = row['Motif']
        data = row['Data']
        #expdesign = row['Exp_design']
        pvalue = str(row['P-value'])
        #comment = row['Comment']
        print(motif)
        image_file = path_motif + 'motif_figure/' + motif + '_nc.png'
        new_row = '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"'+image_file+'\"  ' \
                                                                                                      'transform=\"matrix(0.5184 0 0 0.5184 10.00 '+ str(float(100*i))+')\">\n</image>\n'
        image_file = path_motif + 'motif_figure/' + motif + '_rc.png'
        if os.path.isfile(image_file):
            new_row += '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"' + image_file + '\"  ' \
                                                                                                               'transform=\"matrix(0.5184 0 0 0.5184 170.00 '+ str(float(100*i))+')\">\n</image>\n'
        new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 340 '+ str(float(50+100*i))+')\" font-family=' \
                                                                                           '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+data+'</text>\n'
        #new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 900 ' + str(float(50 + 100 * i)) + ')\" font-family=' \
        #                                                                                          '\"\'MyriadPro-Regular\'\" font-size=\"40\">' + expdesign + '</text>\n'
        new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 1300 '+ str(float(50+100*i))+')\" font-family=' \
                                                                                            '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+pvalue+'</text>\n'
        #new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 1000 ' + str(float(50 + 100 * i)) + ')\" font-family=' \
        #            '\"\'MyriadPro-Regular\'\" font-size=\"40\">' + comment + '</text>\n'
        svg_text += new_row
        i+=1
    svg_text += '</svg>\n'

    with open(path_motif + motif_list + '_' + str(k) + '.svg', 'w') as figure_file:
        figure_file.write(svg_text)
    print('Convert svg to png')
    script = 'convert '+path_motif + motif_list + '_' + str(k)+'.svg '+path_motif + motif_list + '_'+ str(k)+'.png'
    #os.system(script)


def figure_motif_clustering(path_motif, motif_list):
    '''
    Create SVG file with all motifs included
    :param path_motif:
    :param motif_list:
    :return:
    '''
    print('Create SVG file with all motifs')
    motiflist_filename = path_motif + motif_list
    df_summary = pd.read_csv(motiflist_filename + '.txt', sep='\t')
    svg_text = '<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!-- Generator: Adobe Illustrator 15.0.0, SVG Export ' \
               'Plug-In . SVG Version: 6.00 Build 0)  -->\n<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\" \"' \
               'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n<svg version=\"1.1\" id=\"Calque_1\" ' \
               'xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" x=\"0px\" y=\"0px\"' \
               ' width=\"1600px\" height=\"'+str(len(df_summary.index)*100)+'px\" viewBox=\"0 0 1600 ' \
               + str(len(df_summary.index)*100)+'\" xml:space=\"preserve\">\n'
    i = 0
    for index, row in df_summary.iterrows():
        motif = row['Motif']
        data = row['Data']
        #expdesign = row['Exp_design']
        pvalue = str(row['P-value'])
        #comment = row['Comment']
        print(motif)
        image_file = path_motif + '../motif_figure/' + motif + '_nc.png'
        new_row = '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"'+image_file+'\"  ' \
                                                                                                      'transform=\"matrix(0.5184 0 0 0.5184 10.00 '+ str(float(100*i))+')\">\n</image>\n'
        image_file = path_motif + '../motif_figure/' + motif + '_rc.png'
        if os.path.isfile(image_file):
            new_row += '<image overflow=\"visible\" width=\"299\" height=\"176\" xlink:href=\"' + image_file + '\"  ' \
                                                                                                               'transform=\"matrix(0.5184 0 0 0.5184 170.00 '+ str(float(100*i))+')\">\n</image>\n'
        new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 340 '+ str(float(50+100*i))+')\" font-family=' \
                                                                                           '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+data+'</text>\n'
        #new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 900 ' + str(float(50 + 100 * i)) + ')\" font-family=' \
        #                                                                                          '\"\'MyriadPro-Regular\'\" font-size=\"40\">' + expdesign + '</text>\n'
        new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 1300 '+ str(float(50+100*i))+')\" font-family=' \
                                                                                            '\"\'MyriadPro-Regular\'\" font-size=\"40\">'+pvalue+'</text>\n'
        #new_row += '<text transform=\"matrix(0.5184 0 0 0.5184 1000 ' + str(float(50 + 100 * i)) + ')\" font-family=' \
        #            '\"\'MyriadPro-Regular\'\" font-size=\"40\">' + comment + '</text>\n'
        svg_text += new_row
        i += 1
    svg_text += '</svg>\n'

    with open(motiflist_filename + '.svg', 'w') as figure_file:
        figure_file.write(svg_text)
    print('Convert svg to png')
    script = 'convert ' + motiflist_filename + '.svg ' + motiflist_filename + '.png'
    os.system(script)
    os.remove(motiflist_filename + '.txt')


def clean_motif_list(exp_design_list, path_motif, motif_file_index):
    '''
    Combine list of motifs in one big table
    :param exp_design_list:
    :param path_motif:
    :param motif_file_index:
    :return:
    '''
    # read first table
    path_table = path_motif + motif_file_index + exp_design_list[0] + '.txt'
    df_motif = pd.read_csv(path_table, sep='\t', index_col=0)
    df_motif['Exp_design'] = exp_design_list[0]
    df_motif = df_motif.astype(str)
    #for i in [1,len(exp_design_list)
    print(list(df_motif))
    print(df_motif.index)
    for i in range(1, len(exp_design_list)):
        path_table_2 = path_motif + motif_file_index + exp_design_list[i] + '.txt'
        df_motif_2 = pd.read_csv(path_table_2, sep='\t', index_col=0)
        df_motif_2['Exp_design'] = exp_design_list[i]
        df_motif_2 = df_motif_2.astype(str)
        for motif in df_motif_2.index:
            if motif in df_motif.index:
                for colname in df_motif.columns:
                    df_motif[colname][motif] += ';' + df_motif_2[colname][motif]
            else:
                df_motif = df_motif.append(pd.DataFrame(df_motif_2.loc[motif]).transpose(),sort=True)


    df_motif.to_csv(path_motif + 'Motif_Final.txt', sep='\t', header=True)

christophebecavin's avatar
christophebecavin committed
def run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file):
    '''
    Creates a fimo.sh file with all the command to run
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_motif + 'Fimo.sh'
    '''
    motif_list = list()
christophebecavin's avatar
christophebecavin committed
    with open(path_motif_list + '.txt', 'r') as motif_file:
        motif_file.readline()
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
    data_name = exp_design_name
christophebecavin's avatar
christophebecavin committed
    with open(fimo_file, 'w') as motif_sh:
        for motif in motif_list:
            motif_file = path_motif + 'motif/' + motif + '.meme'
            print(motif_file)
            fasta_file = path_analysis + 'fasta/' + data_name + '.fasta'
            background_file = path_analysis + 'results/' + data_name + '/background'
christophebecavin's avatar
christophebecavin committed
            folder_fimo = path_motif + 'fimo/' + motif + '_' + data_name
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
            fimo_sh = 'fimo --parse-genomic-coord --verbosity 1 --thresh 1e-2 --max-stored-scores 1000000 ' \
                      '--oc ' + folder_fimo + ' --bgfile ' + background_file + ' ' + motif_file + ' ' + fasta_file
            print(fimo_sh)
            motif_sh.write(fimo_sh+'\n')
def run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename):
    '''
    Creates a centrimo.sh file with all the command to run
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_motif + 'Centrimo.sh'
    '''
    motif_list = list()
    with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file:
        motif_file.readline()
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    data_name = exp_design_name

    with open(path_motif + 'Centrimo.sh', 'w') as motif_sh:
        for motif in motif_list:
christophebecavin's avatar
christophebecavin committed
            motif_file = path_analysis + 'motif/' + motif + '.meme'
            #print(motif_file)
            fasta_file = path_analysis + 'fasta/' + data_name + '.fasta'
            background_file = path_analysis + 'results/' + data_name + '/background'
            folder_centrimo = path_analysis + 'centrimo/' + motif + '_' + data_name
            centrimo_sh = 'centrimo -seqlen 150 -verbosity 1 -oc ' + folder_centrimo +\
                          ' --bgfile ' + background_file + ' -score 5.0 '\
                          '-ethresh 10.0 ' + fasta_file + ' '+motif_file
            print(centrimo_sh)
            motif_sh.write(centrimo_sh + '\n')



christophebecavin's avatar
christophebecavin committed
def create_motif_table(exp_design_name, path_motif, path_motif_list):
    '''
    Creates two tables summarizing occurence of motif for each peak
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_analysis + motif_list_filename + '_score.txt'
                and path_analysis + motif_list_filename + '_count.txt'
    '''
    motif_list = list()
christophebecavin's avatar
christophebecavin committed
    with open(path_motif_list + '.txt', 'r') as motif_file:
        motif_file.readline()
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())

    data_name = exp_design_name

    fasta_to_length = dict()
christophebecavin's avatar
christophebecavin committed
    print(path_analysis + 'fasta/' + exp_design_name + '.fasta' )
    fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis + 'fasta/' + exp_design_name + '.fasta' , "fasta"))
    nb_peaks = len(fasta_seq.keys())
    np_motif = np.zeros((nb_peaks, len(motif_list)))
    df_motif_score = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list)
    np_motif = np.zeros((nb_peaks, len(motif_list)))
    df_motif_count = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list)
    np_motif = np.ones((nb_peaks, len(motif_list)))
    df_motif_pvalue = pd.DataFrame(np_motif, index=fasta_seq.keys(), columns=motif_list)

    for motif in motif_list:
        print(motif)
christophebecavin's avatar
christophebecavin committed
        df_motif_peaks = pd.read_csv(path_motif + 'fimo/' + motif + '_' + data_name +
                                     '/fimo.txt', index_col=0, sep='\t')
        grouped = df_motif_peaks[['sequence name','score']].groupby('sequence name')
        score = grouped.sum()
        df_motif_score[motif] = score
        counted = grouped.count()
        df_motif_count[motif] = counted
#           df_motif_pvalue[motif][peak] += pvalue
christophebecavin's avatar
christophebecavin committed
    df_motif_score.to_csv(path_motif_list + '_score.txt', sep='\t')
    df_motif_count.to_csv(path_motif_list + '_count.txt', sep='\t')

    #df_motif_pvalue.to_csv(path_motif + exp_design_name + '_' + motif_list_filename + '_pvalue.txt', sep='\t')



def get_size_fasta(path_analysis, exp_design_name):
    with open(path_analysis + 'Fasta_Summary.txt', 'w') as fasta_summary:
        for data_name in os.listdir(path_analysis+'fasta/'):
            if data_name.endswith('.fasta'):
                print(data_name)
                fasta_seq = SeqIO.to_dict(SeqIO.parse(path_analysis+'fasta/'+data_name, "fasta"))
                print(data_name,len(fasta_seq))
                fasta_summary.write(data_name.replace('.fasta','')+'\t'+str(len(fasta_seq))+'\n')


def motif_vs_fasta(path_analysis, exp_design_name, path_motif, motif_list_filename):
    '''
    Creates two tables summarizing occurence of motif for each fasta file
    :param path_analysis: path where the fasta files, background, and fimo results should be
    :param exp_design_name: name of the peaks : example: CecAm_Raw_1
    :param path_motif: path where the motif will be found
    :param motif_list_filename: path of the motif list
    :return: save file to path_analysis + motif_list_filename + '_score.txt'
                 and path_analysis + motif_list_filename + '_count.txt'
    '''
    # read fasta summary
    fasta_to_size = dict()
    with open(path_analysis + 'Fasta_Summary.txt', 'r') as fasta_summary:
        for row in fasta_summary:
            fasta_name = row.split('\t')[0]
            fasta_size = row.split('\t')[1].strip()
            if fasta_size != '0':
                fasta_to_size[fasta_name] = fasta_size
    # read list
    motif_list = list()
    with open(path_motif + motif_list_filename + '.txt', 'r') as motif_file:
        motif_file.readline()
        for row in motif_file:
            motif_list.append(row.split('\t')[0].strip())
    # read motif summary
    df_motif_score = pd.read_csv(path_analysis + motif_list_filename + '_score.txt', index_col=0, sep='\t')
    df_motif_count = pd.read_csv(path_analysis + motif_list_filename + '_count.txt', index_col=0, sep='\t')

    # create motif vs fasta tables
    nb_fasta = len(fasta_to_size.keys())
    np_fasta = np.zeros((len(motif_list), nb_fasta))
    df_motif_fasta_score = pd.DataFrame(np_fasta, index=motif_list, columns=fasta_to_size.keys())
    np_fasta = np.zeros((len(motif_list), nb_fasta))
    df_motif_fasta_count = pd.DataFrame(np_fasta, index=motif_list, columns=fasta_to_size.keys())

    print(motif_list)
    # go through all fasta file
    for fasta_name, fasta_size in fasta_to_size.items():
        fasta_peaks = SeqIO.to_dict(SeqIO.parse(path_analysis+'fasta/'+fasta_name+'.fasta', 'fasta'))
        #print(fasta_peaks.keys())
        for motif in motif_list:
            motif_score = df_motif_score.loc[fasta_peaks.keys(), motif].sum()
            df_motif_fasta_score[fasta_name][motif] = float(motif_score)/float(fasta_size)
            motif_count = df_motif_count.loc[fasta_peaks.keys(), motif].sum()
            df_motif_fasta_count[fasta_name][motif] = float(motif_count)/float(fasta_size)
    df_motif_fasta_score.to_csv(path_analysis + motif_list_filename + '_fasta_score.txt', sep='\t')
    df_motif_fasta_count.to_csv(path_analysis + motif_list_filename + '_fasta_count.txt', sep='\t')



#exp_design_name = 'LivOld'
christophebecavin's avatar
christophebecavin committed
#exp_design_name = 'LiverZT_MaxValues'
becavin christophe's avatar
becavin christophe committed
#exp_design_name = 'Liver_all_MaxMaxValues_1'
christophebecavin's avatar
christophebecavin committed
#exp_design_name = 'Liver_all_MaxMaxValues_3'
#exp_design_name = 'Cecum_all_MaxMaxValues_1'
christophebecavin's avatar
christophebecavin committed
#exp_design_name = 'Cecum_all_MaxMaxValues_3'
exp_design_name = 'Liver_all_T13_MaxMaxValues_3'
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
#exp_design_name = 'LivOld_MaxValues'
#exp_design_name = 'CecAm_MaxValues'
christophebecavin's avatar
christophebecavin committed
pathTars = '/pasteur/projets/policy01/m6aAkker/'
path = '/Volumes/m6aAkker/'
path_analysis = path + 'PeakDiffExpression/' + exp_design_name + '/Motif/'

#path_motif = path_analysis
path_motif = path + 'PeakDiffExpression/Motif/'

#motif_list_filename = 'Motif_'+exp_design_name+'_count'
christophebecavin's avatar
christophebecavin committed
motif_list_filename = 'Motif_' + exp_design_name
path_motif_list = path_analysis + motif_list_filename

# Create all folders for motif analysis
# if not os.path.isdir(path_motif + 'logs/'):
#     os.mkdir(path_motif + 'logs/')
# if not os.path.isdir(path_motif + 'motif/'):
#     os.mkdir(path_motif + 'motif/')
# if not os.path.isdir(path_motif + 'motif_figure/'):
#     os.mkdir(path_motif + 'motif_figure/')
# if not os.path.isdir(path_motif + 'fimo/'):
#     os.mkdir(path_motif + 'fimo/')
# if not os.path.isdir(path_motif + 'centrimo/'):
#     os.mkdir(path_motif + 'centrimo/')

# first look at     log to see which software crashed
#parse_all_logs(path_analysis, exp_design_name)
# Parse meme results and extract all motifs
christophebecavin's avatar
christophebecavin committed
#parse_dreme(path_analysis, path_motif, exp_design_name)
# regroup all motifs
pvalue_cutoff = 0.00001
#regroup_motifs(path_motif, exp_design_name, pvalue_cutoff)
# Select motif centrally enriched -> Could be done automatically
#select_centrally_enriched(path_analysis, path_motif, exp_design_name)
# regroup all motifs for comparisons
# Remove all logs for each exp_design_name
pvalue_cutoff = 0.01
#regroup_motifs_diff(path_motif, exp_design_name, pvalue_cutoff)
christophebecavin's avatar
christophebecavin committed
# Select motif centrally enriched -> Could be done automatically
#select_centrally_enriched_diff(path_analysis, path_motif, exp_design_name)
# Manually filter motif list and put it in path_motif
# add new motif
#motif = 'RRACH'
Christophe  BECAVIN's avatar
Christophe BECAVIN committed
#motif="NGGACN"
#create_motif(path_motif, motif)
#motif = 'NBCAN'
#add_motif(path_motif, motif, motif_list_filename)
# combine list of motifs
exp_design_list = ['Liver_all_MaxMaxValues_1', 'Liver_all_MaxMaxValues_3',
                   'Cecum_all_MaxMaxValues_1', 'Cecum_all_MaxMaxValues_3']
motif_file_index = 'Motif_'
#clean_motif_list(exp_design_list, path_motif, motif_file_index)

# put all motifs figures together to filter the list
# nedd to install imagemagick !!!
motif_list_filename = 'Motif_Final'
#regroup_figures(path_motif, motif_list_filename)
christophebecavin's avatar
christophebecavin committed
path_analysis = pathTars + 'PeakDiffExpression/' + exp_design_name + '/Motif/'
#path_motif = path_analysis
path_motif = pathTars + 'PeakDiffExpression/Motif/'
#motif_list_filename = 'Motif_'+exp_design_name+'_count'
motif_list_filename = 'Motif_' + exp_design_name# + '_Diff'
path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename
#fimo_file = path + 'PeakDiffExpression/Motif/' + 'Fimo_' + exp_design_name + '.sh'
#motif_list_filename = 'Motif_' + exp_design_name
#path_motif_list = path + 'PeakDiffExpression/Motif/' + motif_list_filename
fimo_file = path + 'PeakDiffExpression/Motif/' + 'Fimo_' + exp_design_name + '.sh'
# write sh file for running fimo
print(path_motif_list)
#run_fimo(path_analysis, exp_design_name, path_motif, path_motif_list, fimo_file)
christophebecavin's avatar
christophebecavin committed
# regroup everything in one .sh file
#print('cat '+path_motif+ '/Fimo_* > ' + path_motif + '/Fimo.sh')
# write sh file for running centrimo
#run_centrimo(path_analysis, exp_design_name, path_motif, motif_list_filename)

# Run create_motif_table.py
# python src/python/create_motif_table.py -p /pasteur/projets/policy01/m6aAkker/ -e Liver_all_MaxMaxValues_3 -m Motif_Liver_all_MaxMaxValues_3

# Using R run motif analysis for clustering of motif and biocondition
path_motif = path + 'PeakDiffExpression/Motif/'
#exp_design_name = 'Liver_all_T13_MaxMaxValues_3'
#exp_design_name = 'Liver_all_MaxMaxValues_1'
#exp_design_name = 'Liver_all_MaxMaxValues_3'
#exp_design_name = 'Cecum_all_MaxMaxValues_1'
exp_design_name = 'Cecum_all_withoutabxGF_MaxMaxValues_3'
exp_design_name = exp_design_name + ''

#######
# Run first motif_analysis.R to create '_list.txt'
#######
# for Liver
# index_files = {'_log2count_list','_log2count_Pos_list', '_log2count_Comp_list_T3','_log2count_Comp_list_T13',
#                '_log2count_Comp_list_T3T13','_log2count_Biocond_list',
#              '_fimoscore_list', '_fimoscore_Pos_list', '_fimoscore_Comp_list_T3', '_fimoscore_Comp_list_T13',
#                '_fimoscore_Comp_list_T3T13', '_fimoscore_Biocond_list'}
# for Cecum
index_files = {'_log2count_Pos_NoIntron_list', '_fimoscore_Pos_NoIntron_list'}
# index_files = {'_log2count_list','_log2count_Pos_list', '_log2count_Comp_list', '_log2count_Biocond_list',
#              '_fimoscore_list', '_fimoscore_Pos_list', '_fimoscore_Comp_list', '_fimoscore_Biocond_list'}
path_motif = path + 'PeakDiffExpression/Motif/' + 'Motif_' + exp_design_name + '/'
# use clustered list of motif to recreate figure
for index_file in index_files:
    motif_list_filename = 'Motif_' + exp_design_name + index_file
    figure_motif_clustering(path_motif, motif_list_filename)