From 311daf730778acfbdbc1d271248e2ef8eac98b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9lanie=20=20HENNART?= <melanie.hennart@pasteur.fr> Date: Tue, 29 May 2018 15:54:11 +0200 Subject: [PATCH] Upload New File --- LINcoding.py | 527 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 527 insertions(+) create mode 100644 LINcoding.py diff --git a/LINcoding.py b/LINcoding.py new file mode 100644 index 0000000..040666d --- /dev/null +++ b/LINcoding.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python3.6 +# -*- coding: utf-8 -*- +""" +@Author: Melanie hennart +@PASTEUR_2018 +@Python_3.6 +""" + + +#=============================================================================# +#================================= README ==================================# +#=============================================================================# +""" +Example : + +- Create DataBase LIN : +>>>python LINcoding.py -i BIGSdb.txt -t 1-2 -l 3- -b 1,20,45,66,80,90,95,96,98,99,100 + +- Update DataBase LIN : +>>>python LINcoding.py -i new_Ecoli.txt -r BIGSdb.lin -t 1-2 -l 3- +_______________________________________________________________________________ + +- Create DataBase LIN + prefix tree without length of branches (newick) : +>>>python LINcoding.py -i BIGSdb.txt -t 1-2 -l 3- -b 1,20,45,66,80,90,95,96,98,99,100 -tree Y + +- Create DataBase LIN + prefix tree with length of branches (newick) : +>>>python LINcoding.py -i BIGSdb.txt -t 1-2 -l 3- -b 1,20,45,66,80,90,95,96,98,99,100 -tree Y -lg Y +""" +#=============================================================================# + +#========= IMPORT OF LIBRARIES ===============================================# + +import numpy as np +import copy +import argparse +import itertools +import random +import re + +#=============================================================================# +#====== Functions for LIN code : LIN CODE REFERENCE ==========================# +#=============================================================================# + +def read_file_tsv (name, liste_id = "", liste_loci = ""): + """ + Read allelic profiles file. + """ + + def selection_col (ligne, indice_selec) : + col_selec = [] + for i in indice_selec.split(","): + if "-" in i : + if i[-1] == "-" : + col_selec.extend(ligne.split("\t")[int(i[:-1])-1:]) + elif i[0] == "-" : + col_selec.extend(ligne.split("\t")[:int(i[1:])]) + else : + a, b = int(i.split("-")[0]), int(i.split("-")[1]) + col_selec.extend(ligne.split("\t")[a-1:b]) + else : + col_selec.append(ligne.split("\t")[int(i)-1]) + return col_selec + + file = open(name, "r") + mat_profils = [] + mat_name_profils = [] + for i, ligne in enumerate(file): + ligne = ligne.rstrip("\n") + if liste_id == "" : + if i == 0 : + col_id = ligne.split("\t")[0] + col_loci = ligne.split("\t")[1:] + else : + mat_name_profils.append(ligne.split("\t")[0]) + mat_profils.append(ligne.split("\t")[1:]) + elif liste_loci == "" : + if i == 0 : + col_id = selection_col (ligne, liste_id) + indice = ligne.split("\t").index(col_id[-1]) + col_loci = ligne.split("\t")[indice+1:] + else : + mat_name_profils.append(selection_col (ligne, liste_id)) + mat_profils.append(ligne.split("\t")[indice+1:]) + elif liste_id != "" or liste_loci != "" : + if i == 0 : + col_id = selection_col (ligne, liste_id) + col_loci = selection_col (ligne, liste_loci) + else : + mat_name_profils.append(selection_col (ligne, liste_id)) + mat_profils.append(selection_col (ligne, liste_loci)) + file.close() + mat_int = [] + for ligne in mat_profils: + m_ligne = [] + for element in ligne : + try : m_ligne.append(int(element)) + except ValueError: m_ligne.append(0) + mat_int.append(m_ligne) + return mat_name_profils, np.array(mat_int, dtype = int), col_id, col_loci + +def dist_similarite (p1, p2, k): + """ + Calculates a distance between 2 profiles. + Input : Allelic profiles 1, Allelic profiles 2, length profiles + Output : Similarity and Dissimilarity + """ + with np.errstate(divide="ignore", invalid="ignore"): + c = np.true_divide(p1,p2) + c[c == np.inf] = 0 + c = np.nan_to_num(c) + compt = np.count_nonzero(c==0) + simi = np.count_nonzero(c==1) + l = (k - compt) + if l != 0 : + return simi / l * 100 , (0.00001 + k - compt - simi ) / l * 100 + else : + return 0.0, 100.0 + +def matrice_similarite_total (mat_profils): + """ + Calculates a distance matrix. + Input : Allelic profiles + Output : Distance matrix of similarity and dissimilarity. + """ + mat_s = np.ones((len(mat_profils),len(mat_profils)))*100 + mat_d = np.zeros((len(mat_profils),len(mat_profils))) + k = len(mat_profils[0]) + for p1, p2 in itertools.combinations(range(len(mat_profils)), 2): + mat_s [p1][p2], mat_d [p1][p2] = dist_similarite (mat_profils[p1], mat_profils[p2], k) + mat_s [p2][p1] = mat_s [p1][p2] + mat_d [p2][p1] = mat_d [p1][p2] + return mat_s, mat_d + +def sort_name_profil (mat_profils, mat_name_profils, mat_dis): + """ + Prim's algorithm. + Input : Allelic profiles, profiles names, Distance matrix + Output : Ordered allelic profiles, ordered profiles names and index list Ordered. + """ + M = np.copy(mat_dis) + for i in range(len(M)): + M[i][i] = 100 + ind = np.argmin(M) + x, y = int (ind / len(mat_dis)), ind - int (ind / len(mat_dis)) * len(mat_dis) + ordre_profil = [x,y] + mat_name_profils_sort = [mat_name_profils[x], mat_name_profils[y]] + mat_profils_sort = [mat_profils[x], mat_profils[y]] + M[x][y] = M[y][x] = 100 + l = len(ordre_profil) + while len(ordre_profil) != len(M): + v_min = np.argmin([np.min(M[x]) for x in ordre_profil]) + k = [np.argmin(M[x]) for x in ordre_profil][v_min] + for i in ordre_profil: + M[i][k] = M[k][i] = 100 + if k not in ordre_profil: + ordre_profil.append(k) + mat_name_profils_sort.append(mat_name_profils[k]) + mat_profils_sort.append(mat_profils[k]) + if l == len(ordre_profil) : + ind = np.argmin(M) + x, y = int (ind / len(mat_dis)), ind - int (ind / len(mat_dis)) * len(mat_dis) + ordre_profil.extend([x,y]) + mat_name_profils_sort.extend([mat_name_profils[x], mat_name_profils[y]]) + mat_profils_sort.extend([mat_profils[x], mat_profils[y]]) + M[x][y] = M[y][x] = 100 + l = len(ordre_profil) + return mat_name_profils_sort, np.array(mat_profils_sort, dtype=int), ordre_profil + +def LIN_Code_REF (mat_name_profils, mat_s, ordre, bins): + """ + Algorithm LIN. + Input : profil name, Distance matrix and all bins + Output : Update "mat_name_profils" with code LIN + """ + for i, name_profil in enumerate(mat_name_profils): + if i == 0 : + name_profil.append([0]*len(bins)) + mat_profils_update = [ordre[0]] + else : + argmax_ = np.argmax([mat_s[ordre[i]][x] for x in mat_profils_update]) + s_max = np.max([mat_s[ordre[i]][x] for x in mat_profils_update]) + LIN = copy.deepcopy(mat_name_profils[argmax_][-1]) + if s_max != 100 : + max_ = 0 + ind_bin = [i for i,x in enumerate(bins) if x > s_max][0] + for lin in mat_name_profils[:i]: + if lin[-1][:ind_bin] == LIN[:ind_bin] : + if max_ < lin[-1][ind_bin]: + max_ = lin[-1][ind_bin] + LIN[ind_bin] = max_ + 1 + LIN[ind_bin + 1 :] = [0] * (len(LIN)- ind_bin -1) + name_profil.append(LIN) + mat_profils_update.append(ordre[i]) + return + +#=============================================================================# +#=============================================================================# + +def write_file (file_name_ref, file_name_isolats, mat_name_profils, mat_profils, col_id, col_loci, bins_threshold): + """ + Write 2 files : .lin (LIN + allelic profils) and .txt (isolates names + LIN) + """ + file_Ref = open(file_name_ref, "w") + file_Iso = open(file_name_isolats, "w") + file_Ref.write(str(bins_threshold) + "\t" + "\t".join(col_loci) + "\n") + file_Iso.write("\t".join(col_id) + "\t" + str(bins_threshold) + "\n") + for i, isolat in enumerate(mat_name_profils): + str_LIN = "_".join([str(i) for i in isolat[-1]]) + file_Ref.write(str_LIN + "\t" + "\t".join([str(x) for x in mat_profils[i]]) + "\n") + file_Iso.write("\t".join(isolat[:-1])+"\t"+ str_LIN + "\n") + file_Ref.close() + file_Iso.close() + return + +#=============================================================================# +#====== Functions for LIN code : LIN CODE NEW ISOLATES =======================# +#=============================================================================# + +def read_file_lin (name): + """ + Read Database. + """ + file = open(name, "r") + LIN = [] + mat_profils = [] + for i, ligne in enumerate(file): + ligne = ligne.rstrip("\n") + if i == 0 : + bins = [float(x) for x in ligne.split("\t")[0][1:-1].split(",")] + col_loci = ligne.split("\t")[1:] + else : + LIN.append([int(x) for x in ligne.split("\t")[0].split("_")]) + mat_profils.append(ligne.split("\t")[1:]) + file.close() + mat_profils = np.array(mat_profils) + np.place(mat_profils, mat_profils == "", 0) + mat_profils = mat_profils.astype(int) + return bins, col_loci, LIN, mat_profils + + +def check_up_loci (col_loci_ref, col_loci_iso, mat_profils_iso): + """ + Order columns of loci as the Database. + """ + if col_loci_ref != col_loci_iso : + mat_profils_iso_sort = np.zeros(np.shape(mat_profils_iso), dtype = int) + for i, loci_ref in enumerate(col_loci_ref) : + if loci_ref in col_loci_iso : + indice = col_loci_iso.index(loci_ref) + mat_profils_iso_sort[:,i] = mat_profils_iso [:,indice] + else : + mat_profils_iso_sort[:,i] = [0] * len(mat_profils_iso_sort) + else : + mat_profils_iso_sort = mat_profils_iso + return mat_profils_iso_sort + + +def matrice_similarite_ref_iso (mat_profils_ref, mat_profils_iso): + """ + Calculates a distance matrix between profils new and Database. + Input : Allelic profiles of Database and Allelic profiles new + Output : Distance matrix of similarity and dissimilarity. + """ + mat_s = np.ones((len(mat_profils_iso),len(mat_profils_ref)))*100 + mat_d = np.zeros((len(mat_profils_iso),len(mat_profils_ref))) + k = len(mat_profils_ref[0]) + for i, p1 in enumerate(mat_profils_iso) : + for j , p2 in enumerate(mat_profils_ref) : + mat_s [i][j], mat_d [i][j] = dist_similarite (p1, p2, k) + return mat_s, mat_d + + +def LIN_Code_NEW (mat_LIN, mat_name_profils_iso, bins, mat_s ) : + """ + Algorithm LIN. + Input : + Output : + """ + for i, name_profil_iso in enumerate(mat_name_profils_iso): + argmax_ , s_max = mat_s[i].argmax(), mat_s[i].max() + LIN = copy.deepcopy(mat_LIN[argmax_]) + if s_max != 100 : + ind_bin = [i for i,x in enumerate(bins) if x > s_max][0] + max_ = 0 + for lin in mat_LIN: + if lin[:ind_bin] == LIN[:ind_bin] : + if max_ < lin[ind_bin]: + max_ = lin[ind_bin] + LIN[ind_bin] = max_ + 1 + LIN[ind_bin + 1 :] = [0] * (len(LIN)- ind_bin -1) + name_profil_iso.append(LIN) + mat_LIN.append(LIN) + return + +#=============================================================================# +#=============================================================================# + +def read_file_sort (name) : + """ + Read the file containing the names profiles ordered written by the user. + """ + file = open(name, "r") + mat_name_profils_read = [] + for i, ligne in enumerate(file): + ligne = ligne.rstrip("\n") + if i == 0 : col_id_read = ligne.split("\t") + else: mat_name_profils_read.append(ligne.split("\t")) + return col_id_read, mat_name_profils_read + + +def sort_name (mat_name_profils_read, mat_name_profils, mat_profils): + """ + Sort according to user's order. + """ + mat_profils_read = [] + mat_name_profils_non_read = [] + mat_profils_non_read = [] + for i, name_profil in enumerate(mat_name_profils) : + if name_profil in mat_name_profils_read : + mat_profils_read.append(mat_profils[i]) + else : + mat_name_profils_non_read.append(name_profil) + mat_profils_non_read.append(mat_profils[i]) + mat_profils_read.extend(mat_profils_non_read) + mat_name_profils_read.extend(mat_name_profils_non_read) + return mat_name_profils_read, np.array(mat_profils_read) + +#=============================================================================# +#====== Creation of the tree : STRUCTURE =====================================# +#=============================================================================# + +def sort_LIN(tuple_): + return tuple_[-1] + +def partition(mat_name_profils, i, S, bins) : + groupe = [] + for j, ind in enumerate(S) : + if j == 0 : + val = mat_name_profils[ind][-1][i] + sous_grpe = [] + sous_grpe.append(ind) + else : + if mat_name_profils[ind][-1][i] == val : sous_grpe.append(ind) + else : + val = mat_name_profils[ind][-1][i] + groupe.append(sous_grpe) + sous_grpe = [] + sous_grpe.append(ind) + groupe.append(sous_grpe) + return groupe + +def Ne(l, mat_name_profils, i, S, bins) : + a = partition(mat_name_profils, i, S, bins) + if i == 0 : + l[str(S)] = (a, [100-bins[i]]*len(a), 100-bins[i]) + for j, k in enumerate(a) : + if type(k) == list and len(k) > 1 and i < len(bins) - 1 : + t = Ne(l, mat_name_profils, i+1, k, bins) + if t[0] != k : + if i != len(bins) - 2 : + l[str(k)] = (t, [100-bins[i+1]]*len(t), 100-bins[i+1]) + else : + l[str(k)] = (t, [(bins[i+1] - bins[i])/10]*len(t), 100-bins[i+1]) + elif i == len(bins) - 1 and len(k) > 1: + r = str(k)[1:-1].replace(" ","").split(",") + r ="["+",".join(["["+x+"]"+":0.0" for x in r])+"]" + l[str(k)] = r + return a + +#=============================================================================# +#====== Creation of the tree without length of branches ======================# +#=============================================================================# + +def create(l, d, m) : + for i, el in enumerate(d): + if type(el) == list and str(el) in l.keys(): + d[i] = l[str(el)][0] + for j, k in enumerate(d): + if type(d[j]) == list : create(l, d[j], m+1) + return d + +def newick (arbre, mat_name_profils) : + arbre = str(arbre) + arbre = arbre.replace(" ","") + arbre = arbre.replace("]",")") + arbre = arbre.replace("[","(") + expression = re.compile("\([^,)(]*\)") + res = expression.findall(arbre) + for i in res : arbre = arbre.replace(i,i[1:-1]) + expression = re.compile("[0-9]+") + res = expression.findall(arbre) + for i in reversed(res): + d = arbre.index(i) + f = d + len(i) + arbre = arbre[:d] +"_".join(mat_name_profils[int(i)][:-1])+arbre[f:] + return arbre +";" + +#=============================================================================# +#====== Creation of the tree with length of branches =========================# +#=============================================================================# + +def g(l, mat_name_profils): + for key in l.keys(): + for i, valeur in enumerate(l[key][0]) : + if str(valeur) in l.keys() : + try : l[key][1][i] = l[key][1][i] - l[str(valeur)][-1] + except : None + m = copy.deepcopy(l) + for key in l.keys(): + ss = "" + if type(l[key]) != str : + for i , valeur in enumerate(l[key][0]): + ss += str(valeur)+":"+str(l[key][1][i]) + if i != len(l[key][0])-1 : ss += "," + m[key] = "["+ss+"]" + for key in m.keys(): + for k, valeur in l.items() : + if key in m[k] : m[k] = m[k].replace(key, m[key]) + arb = m[str(list(range(len(mat_name_profils))))] + expression = re.compile("\[[0-9]+\]") + res = expression.findall(arb) + for i in reversed(res): + arb = arb.replace(i,"_".join(mat_name_profils[int(i[1:-1])][:-1])) + arb = arb.replace("]",")") + arb = arb.replace("[","(") + return arb +";" + +#========= MAIN PROGRAM =====================================================# + +#=== Parameters +parser = argparse.ArgumentParser() + +parser.add_argument("-i", dest="fichier_tsv", type=str, required=True, help="file name : file of profiles isolates") + +parser.add_argument("-r", dest="fichier_lin", type=str, help="file name .lin") +parser.add_argument("-s", dest="fichier_txt", type=str, help="file name to sort strains") +parser.add_argument("-ns", dest="sort", type=str, help="Y for no sort strains") +parser.add_argument("-a", dest="alea", type=str, help="Y for ordre aleatoire") + +parser.add_argument("-t", dest="id", type=str, help="numero(s) column(s) name isolats, ex : 0-2,6") +parser.add_argument("-l", dest="loci", type=str, help="numero(s) column(s) name isolats, ex : 2-265,300") + +parser.add_argument("-b", dest="bins", type=str, help="bins, ex : 5,20,80,95,100") +parser.add_argument("-o", dest="fichier_sorti", type=str, help="file out") + +parser.add_argument("-tree", dest="t", type=str, required=True, help="create tree : Y ") +parser.add_argument("-lg", dest="len_branche", type=str, help="Y => longueur des branches") + +args = parser.parse_args() + +fichier_tsv = args.fichier_tsv +fichier_lin = args.fichier_lin +fichier_txt = args.fichier_txt +no_sort = args.sort +alea = args.alea +tree = args.t +branche = args.len_branche +indice_id = ("" if args.id == None else args.id) +indice_loci = ("" if args.loci == None else args.loci) +file_name = args.fichier_sorti + +if file_name is not None : + file_name_ref = file_name + ".lin" + file_name_isolats = "LIN_" + file_name + ".txt" +else: + file_name_ref = fichier_tsv[:-3] + "lin" + file_name_isolats = "LIN_" + fichier_tsv[:-3]+ "txt" + + +#=== Algorithm +if fichier_lin is None : + bins_threshold = [float(x) for x in args.bins.split(",")] + mat_name_profils, mat_profils, col_id, col_loci = read_file_tsv (fichier_tsv, indice_id, indice_loci) + + if fichier_txt is not None : + col_id_read, mat_name_profils_read = read_file_sort (fichier_txt) + if col_id_read != col_id : + print ("Warning") + else: + mat_name_profils, mat_profils = sort_name (mat_name_profils_read, mat_name_profils, mat_profils) + + mat_s, mat_d = matrice_similarite_total (mat_profils) + ordre_profil = list(range(len(mat_name_profils))) + + if no_sort is None and fichier_txt is None: + if alea is None : + mat_name_profils, mat_profils, ordre_profil = sort_name_profil (mat_profils, mat_name_profils, mat_d) + else : + random.shuffle(ordre_profil) + mat_name_profils = [mat_name_profils[x] for x in ordre_profil] + mat_profils = np.array([mat_profils[x] for x in ordre_profil], dtype = int) + + LIN_Code_REF (mat_name_profils, mat_s, ordre_profil, bins_threshold) + write_file (file_name_ref, file_name_isolats, mat_name_profils, mat_profils, col_id, col_loci, bins_threshold) + +else : + bins, col_loci_ref, LIN, mat_profils_ref = read_file_lin (fichier_lin) + + mat_name_profils_iso, mat_profils_iso, col_id_iso, col_loci_iso = read_file_tsv (fichier_tsv, indice_id, indice_loci) + + mat_profils_iso = check_up_loci (col_loci_ref, col_loci_iso, mat_profils_iso) + mat_s, mat_d = matrice_similarite_ref_iso (mat_profils_ref, mat_profils_iso) + LIN_Code_NEW (LIN, mat_name_profils_iso, bins, mat_s ) + + write_file (file_name_ref, file_name_isolats, mat_name_profils_iso, mat_profils_iso, col_id_iso, col_loci_ref, bins) + +#=== Create a tree if user asks +if tree is not None and fichier_lin is None : + mat_name_profils.sort(key=sort_LIN) + l = {} + Ne(l, mat_name_profils, 0, list(range(len(mat_name_profils))), bins_threshold) + if branche == "Y" : + arb_lg = g(l, mat_name_profils) + f = open("ARBRE_REF.txt", "w") + f.write(arb_lg +"\n") + f.close() + else : + m = copy.deepcopy(l) + for key in l.keys(): + if type(m[key]) == str : del m[key] + d = create(m, [list(range(len(mat_name_profils)))], 0) + arb = newick (d[0], mat_name_profils) + f = open("Arbres.txt", "a") + f.write(arb +"\n") + f.close() + + \ No newline at end of file -- GitLab