diff --git a/Module/cost_matrix_uncertainty.py
b/Module/cost_matrix_uncertainty.py index 9ef409080eda4f19db6c9dc727fd0d9362204457..a04024b3505453a490c2b5e0661e849e6d6c59ee 100755 --- a/Module/cost_matrix_uncertainty.py +++ b/Module/cost_matrix_uncertainty.py @@ -10,7 +10,7 @@ from itertools import chain import copy -import multiprocessing as mp +import multiprocess as mp ### Useful functions for parallele @@ -22,6 +22,23 @@ def vals_mp(pairs, df_2, out, funct): return vals + +def monotonic_model_RE(p, df): + p1, p2, key = p.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + tr1 = df[p1].values.tolist() + tr2 = df[p2].values.tolist() + diag = df['target'].values.tolist() + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + X, m = mru.compute_recursion(data, (rev, up, key)) + reg, bpr, bpb, pr, pb = m[key] + return (reg, p) + + + + + #### ERROR MATRIX ###### @@ -79,6 +96,16 @@ def error_matrix(df_, pairs, nbcpus, funct): del df_2 + vals_re = [(c, df) for c in pairs] + res_re = pool.starmap(monotonic_model_RE, vals_re, max(1,len(vals)//nbcpus)) + + REd = {re[1] : re[0] for re in res_re} + REd['target'] = np.nan + re_s = pd.Series(REd) + re_s.name = 'RE' + + mat_err_re = pd.concat((mat_err,re_s.to_frame().T), axis=0) + unc = {col: mat_err[col].to_list().count(-1) for col in pairs} @@ -86,7 +113,7 @@ def error_matrix(df_, pairs, nbcpus, funct): unc_s = pd.Series(unc) unc_s.name = 'uncertain' - mat_err_unc = pd.concat((mat_err,unc_s.to_frame().T), axis=0) + mat_err_unc = pd.concat((mat_err_re,unc_s.to_frame().T), axis=0) @@ -103,12 +130,12 @@ def error_matrix(df_, pairs, nbcpus, funct): err = {col: mat_err[col].to_list().count(1)/(mat_err[col].to_list().count(1) + mat_err[col].to_list().count(0)) for col in pairs if col not in rem} err['target'] = np.nan err_s = pd.Series(err) - err_s.name = 'error' + err_s.name = 'LOOCV' mat_err_final = pd.concat((mat_err_unc,err_s.to_frame().T), axis=0) - mat_err_final.sort_values(axis = 1, by=['error', 'uncertain'], inplace=True) + mat_err_final.sort_values(axis = 1, by=['LOOCV', 'RE', 'uncertain'], inplace=True) del df return mat_err_final @@ -146,5 +173,5 @@ def error_to_prediction(matrix, df): def cost_classifiers(ndf): cols = list(ndf.columns) cols.remove('target') - cost = {cols[i] : ndf[cols[i]].loc[['error']][0] for i in range(len(cols))} + cost = {cols[i] : ndf[cols[i]].loc[['LOOCV']][0] for i in range(len(cols))} return cost diff --git a/Module/dynamic_preselection.py b/Module/dynamic_preselection.py new file mode 100644 index 0000000000000000000000000000000000000000..6c71fd6ab5a1cb616566f87f5f80b737bdd32919 --- /dev/null +++ b/Module/dynamic_preselection.py @@ -0,0 +1,491 @@ +import pandas as pd +import numpy as np +from random import shuffle +import pandas as pd +from Module import monotonic_regression_uncertainty as mru +from Module import tools +import heapq as hq +from copy import deepcopy + +import multiprocessing as mp + +import time + +def monotonic_model_RE(p, df): + p1, p2, key = p.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + tr1 = df[p1].values.tolist() + tr2 = df[p2].values.tolist() + diag = df['target'].values.tolist() + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + X, m = mru.compute_recursion(data, (rev, up, key)) + reg, bpr, bpb, pr, pb = m[key] + return (reg, p) + + +def H_df(df, cls, nbcpus): + pool = mp.Pool(nbcpus) + vals = [(c, df) for c in cls] + res = pool.starmap(monotonic_model_RE, vals, max(1,len(vals)//nbcpus)) + #f = open('logs_H_df.txt', 'a') + #f.write('H d_f {} \n'.format(res)) + 
#f.close() + return sorted(res) + + +def monotonic_model_LOOCV(p, df, maxi): + p1, p2, key = p.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + tr1 = df[p1].values.tolist() + tr2 = df[p2].values.tolist() + diag = df['target'].values.tolist() + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + + err = 0 + + + for d in data: + data_bis = deepcopy(data) + data_bis.remove(d) + X, m = mru.compute_recursion(data_bis, (rev, up, key)) + + + target = d[2] + out = d[0] + + reg, bpr, bpb, rps, bps = m[key] + pred = mru.predict_severe(out, bpr, bpb, rev, up) + + err += abs(target-pred) + if err > maxi: + return (maxi+1, p) + + return (err, p) + + +def Q_df(df, cls, nbcpus, maxi): + pool = mp.Pool(nbcpus) + vals = [(c, df, maxi) for c in cls] + return sorted(pool.starmap(monotonic_model_LOOCV, vals, max(1,len(vals)//nbcpus))) + + +def heapify(arr, n, i): + # Find the largest among root, left child and right child + largest = i + l = 2 * i + 1 + r = 2 * i + 2 + + if l < n and arr[i][0] < arr[l][0]: + largest = l + + if r < n and arr[largest][0] < arr[r][0]: + largest = r + + # Swap and continue heapifying if root is not largest + if largest != i: + arr[i], arr[largest] = arr[largest], arr[i] + heapify(arr, n, largest) + + +# Function to insert an element into the tree +def insert(array, newNum): + size = len(array)+1 + if size == 0: + array.append(newNum) + else: + array.append(newNum) + for i in range((size // 2) - 1, -1, -1): + heapify(array, size, i) + + +# Function to delete an element from the tree +def deleteNode(array, num): + size = len(array) + i = 0 + for i in range(0, size): + if num == array[i][0]: + break + + #print('array 1', array) + array[i], array[size - 1] = array[size - 1], array[i] + #print('array 2', array) + array.remove(array[size - 1]) + #print('array 3', array) + + for i in range((len(array) // 2) - 1, -1, -1): + heapify(array, len(array), i) + + +def deleteNodeNum(array, num): + size = len(array) + i = 0 + for i in range(0, size): + if num == array[i][0]: + break + + array[i], array[size - 1] = array[size - 1], array[i] + array.remove(array[size - 1]) + for i in range((len(array) // 2) - 1, -1, -1): + heapify(array, len(array), i) + +def deleteNodeName(array, name): + size = len(array) + i = 0 + for i in range(0, size): + if name == array[i][1]: + break + + array[i], array[size - 1] = array[size - 1], array[i] + array.remove(array[size - 1]) + + for i in range((len(array) // 2) - 1, -1, -1): + heapify(array, len(array), i) + + + +def keepPairs(Q, pair, q): + g1, g2, g3 = pair.split('/') + r1 = lookGene(Q, g1) + r2 = lookGene(Q, g2) + + f1 = True + f2 = True + + pairs = list() + + if r1[0]: + if q >= r1[1][0]: + f1 = False + pairs.append(r1[1]) + + if r2[0]: + if q >= r2[1][0]: + f2 = False + pairs.append(r2[1]) + + if f1 and f2: + return True, pairs + else: + return False, pairs + +def lookGene(Q, gene): + flag = False + pair_with_gene = None + for el in Q: + p1, p2, p3 = el[1].split('/') + if gene == p1 or gene == p2: + flag = True + pair_with_gene = el + break + return flag, pair_with_gene + + +def suppH(H, num): + j = 0 + while H[j][0] < num: + j+=1 + return H[:j] + + + + +def nb_de_genes(G): + s = set() + for k in G.keys(): + s = s.union(G[k]) + return len(s) + + +def H_dict(H): + Hd = dict() + for h in H: + kh = h[0] + if kh not in Hd.keys(): + Hd[kh] = list() + Hd[kh].append(h[1]) + return Hd + + +def Q_dict(Q): + Qd = dict() + G = dict() + for q in Q: + kq = q[0] + g1, g2, g3 = q[1].split('/') + if kq not in Qd.keys(): + Qd[kq] = 
set() + if kq not in G.keys(): + G[kq] = set() + Qd[kq].add(q[1]) + G[kq].add(g1) + G[kq].add(g2) + + + return Qd, G + +def update_dict(G, G_): + for key in G_.keys(): + if key not in G.keys(): + G[key] = G_[key] + else: + G[key] = G[key].union(G_[key]) + return G + + +def supp_H_above_a(H, a): + S = [k for k in H.keys() if k > a] + for s in S: + del H[s] + return H + +def supp_H_below_a(H, a): + S = [k for k in H.keys() if k <= a] + for s in S: + del H[s] + return H + + +def check_disjoint_pairs(Q, param): + dis_p = list() + genes = list() + + flag = False + for k in sorted(Q.keys()): + pairs = Q[k] + for p in pairs: + g1, g2, g3 = p.split('/') + if g1 not in genes and g2 not in genes: + dis_p.append(p) + genes.append(g1) + genes.append(g2) + if len(dis_p) >= param: + flag = True + break + print('Pairs with {} genes'.format(len(genes))) + return flag + +### Preselection based on the number of genes +def algorithm_1(cls, df, k, nbcpus, logs): + + t0 = time.time() + H = H_df(df, cls, nbcpus) + t1 = time.time() + + f = open(logs, 'a') + f.write('H computed in {} en len(H) = {}\n\n'.format(t1 - t0, len(H))) + f.close() + + + Hd = H_dict(H) + + Q = dict() #For each strat of LOOCV, we have a list of pairs + G = dict() #List of genes in Q + + count = 0 + + + + t2 = time.time() + + + for h_key in sorted(Hd.keys()): + + pairs = Hd[h_key] + Q_ = Q_df(df, pairs, nbcpus, max(Hd.keys())) + Qd, Gd = Q_dict(Q_) + + + G = update_dict(G, Gd) + Q = update_dict(Q, Qd) + + + if nb_de_genes(G) >=k: + break + + + f = open(logs, 'a') + f.write('Old G {}\n\n'.format(G.keys())) + f.close() + + a = max(Q.keys()) + G_ = deepcopy(G) + del G_[a] + while nb_de_genes(G) > k and nb_de_genes(G_) >= k: + G = G_ + del Q[a] + a = max(Q.keys()) + G_ = deepcopy(G) + del G_[a] + + f = open(logs, 'a') + f.write('New G {}\n\n'.format(G.keys())) + f.close() + + + + a = max(Q.keys()) #Highest value of LOOCV in Q + Hd = supp_H_above_a(Hd, a) + Hd = supp_H_below_a(Hd, h_key) + + + t3 = time.time() + f = open(logs, 'a') + f.write('S1 computed in {} and size pairs={}\n\n'.format(t3 - t2, Hd.keys())) + f.close() + f = open(logs, 'a') + f.write('G {}\n\n'.format(G.keys())) + f.close() + + + t4 = time.time() + + + for h_key in sorted(Hd.keys()): + f = open(logs, 'a') + f.write('H_key {}\n\n'.format(h_key)) + f.close() + a = max(Q.keys()) + if h_key <= a: + + pairs = Hd[h_key] + Q_ = Q_df(df, pairs, nbcpus, a) + Qd, Gd = Q_dict(Q_) + + Qd = supp_H_above_a(Qd, a) + + + + Gd = supp_H_above_a(Gd, a) + + + + G = update_dict(G, Gd) + Q = update_dict(Q, Qd) + + f = open(logs, 'a') + f.write('Old G {}\n\n'.format(G.keys())) + f.close() + + G_ = deepcopy(G) + del G_[a] + while nb_de_genes(G) > k and nb_de_genes(G_) >= k: + G = G_ + del Q[a] + a = max(Q.keys()) + G_ = deepcopy(G) + del G_[a] + + f = open(logs, 'a') + f.write('New G {}\n\n'.format(G.keys())) + f.close() + pairs = list() + for key in Q.keys(): + pairs += Q[key] + + + t5 = time.time() + f = open(logs, 'a') + f.write('S2 computed in {} and size pairs={}\n\n'.format(t5 - t4, len(pairs))) + f.close() + + return Q, pairs + + + + +## Preselection based on the number of pairs + +def algorithm_2(cls, df, m, nbcpus, logs): + + t0 = time.process_time() + H = H_df(df, cls, nbcpus) + t1 = time.process_time() + + f = open(logs, 'a') + f.write('H computed in {} en len(H) = {}\n\n'.format(t1 - t0, len(H))) + f.close() + + + Hd = H_dict(H) + + Q = dict() #For each strat of LOOCV, we have a list of pairs + + + count = 0 + + + t2 = time.process_time() + + + for h_key in sorted(Hd.keys()): + + pairs = 
Hd[h_key] + Q_ = Q_df(df, pairs, nbcpus, max(Hd.keys())) + Qd, Gd = Q_dict(Q_) + + Q = update_dict(Q, Qd) + + + if check_disjoint_pairs(Q, m): + break + + + a = max(Q.keys()) #Highest value of LOOCV in Q + Hd = supp_H_above_a(Hd, a) + Hd = supp_H_below_a(Hd, h_key) + + + t3 = time.process_time() + f = open(logs, 'a') + f.write('S1 computed in {}, H size pairs={}, Q size pairs = {}\n\n'.format(t3 - t2, Hd.keys(), Q.keys())) + f.close() + + t4 = time.process_time() + + for h_key in sorted(Hd.keys()): + a = max(Q.keys()) + if h_key <= a: + + pairs = Hd[h_key] + Q_ = Q_df(df, pairs, nbcpus, a) + Qd, Gd = Q_dict(Q_) + + Qd = supp_H_above_a(Qd, a) + + Q = update_dict(Q, Qd) + + Q_ = deepcopy(Q) + del Q_[a] + + while check_disjoint_pairs(Q_, m): + + Q = Q_ + a = max(Q.keys()) + + Q_ = deepcopy(Q) + del Q_[a] + + pairs = list() + for key in Q.keys(): + pairs += Q[key] + + t5 = time.process_time() + f = open(logs, 'a') + f.write('S2 computed in {} and size pairs={}\n\n'.format(t5 - t4, len(pairs))) + f.close() + + return Q, pairs + + +def all_configurations(df): + transcripts = list(df.columns) + transcripts.remove('target') + + configurations = list() + for i in range(len(transcripts)): + for j in range(i+1, len(transcripts)): + for key in range(1,5): + configurations.append('/'.join([transcripts[i], transcripts[j], str(key)])) + return configurations diff --git a/Module/optimal_k_aggregations.py b/Module/optimal_k_aggregations.py index fe0c4504e7531b46aba97b2ec1d275cd48168112..a0ca5821f75e1120dfc952d56b8242bce005deea 100755 --- a/Module/optimal_k_aggregations.py +++ b/Module/optimal_k_aggregations.py @@ -66,7 +66,7 @@ def create_and_predict_metamodel(df_, out, pairs, nbcpus, funct): def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): print('k misclassification : {}\n'.format(funct)) - k_mis = {k : list() for k in range(3, max_k)} #Store for each value of k, whereas patients were misclassified or not with an ensemble of k classifiers + k_mis = {k : list() for k in range(min_k, max_k)} #Store for each value of k, whereas patients were misclassified or not with an ensemble of k classifiers #pairs_err = {cl : list() for cl in cls} #For each classifiers we are going to store the average misclassification error (computed with LOOCV) made with each patients diff --git a/Module/selection_algorithm.py b/Module/selection_algorithm.py index 84716056a479f22549770fc154c8df323393bc68..0dce58c1d828970c5caa8e77723f02fa8fcf76f3 100755 --- a/Module/selection_algorithm.py +++ b/Module/selection_algorithm.py @@ -43,7 +43,7 @@ def filter_pairs_adapt(pairs, cl): def NB(df, ndf_, cost, k, nbcpus, mes = ms.MVE): ndf = copy.deepcopy(ndf_) - ndf.drop(['uncertain', 'error'], inplace=True) + ndf.drop(['uncertain', 'LOOCV'], inplace=True) pairs = sorted(cost.items(), key=lambda t: t[1]) pairs = [pairs[i][0] for i in range(len(pairs))] @@ -73,7 +73,7 @@ def FS(df, ndf_, cost, k, nbcpus, mes = ms.MVE, jump = 30): pool = mp.Pool(nbcpus) ndf = copy.deepcopy(ndf_) - ndf.drop(['uncertain', 'error'], inplace=True) + ndf.drop(['uncertain', 'LOOCV'], inplace=True) temp = min(cost.values()) res = [key for key in cost.keys() if cost[key] == temp] #Many classifiers can have the lowest error @@ -134,7 +134,7 @@ def BS(df, ndf_, cost, k, nbcpus, mes = ms.F2, end = 30): pool = mp.Pool(nbcpus) ndf = copy.deepcopy(ndf_) - ndf.drop(['uncertain', 'error'], inplace=True) + ndf.drop(['uncertain', 'LOOCV'], inplace=True) pairs = sorted(cost.items(), key=lambda t: t[1]) diff --git a/Module/show_results_4cases.py 
b/Module/show_results_4cases.py index f83fc86d1914c8f9311a16d0cd660cfb8b62c5af..25d9795838200b05e7b9b26be29576b92ccc6527 100755 --- a/Module/show_results_4cases.py +++ b/Module/show_results_4cases.py @@ -76,13 +76,13 @@ def print_model(data, models, p1, p2, df1, pathname = None, cm=None): for bp in bpb: x, y = bp if key == 1: - ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightsteelblue', zorder = 1)) elif key == 2: - ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightsteelblue', zorder = 1)) elif key == 3: - ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor = 'lightsteelblue', zorder = 1)) else: - ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightsteelblue', zorder = 1)) for bp in bpr: @@ -106,16 +106,18 @@ def print_model(data, models, p1, p2, df1, pathname = None, cm=None): for d in data: if d[2] == 0: - plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + plt.scatter(d[0][0], d[0][1], c = 'royalblue', marker='.', zorder = 2) elif d[2] == 1: - plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.scatter(d[0][0], d[0][1], c = 'firebrick', marker='.', zorder = 2) plt.xlabel(g1) plt.ylabel(g2) if cm is not None: pair = '/'.join([p1, p2, str(key)]) - error = cm.at['error', pair] - plt.title('LOOCV Error {}'.format(error)) + error = cm.at['LOOCV', pair] + plt.title('RE = {} & LOOCVE = {}'.format(round(reg_err/len(data),3),round(error,3))) + else: + plt.title('RE = {}'.format(round(reg_err,3))) if pathname is not None: plt.savefig(pathname + g1 + '_' + g2 + '.png') @@ -346,7 +348,6 @@ def print_model_RS(data, models, p1, p2, df1, pathname = None, cm=None): plt.figure(figsize=(3,3)) ax = plt.axes() ax.set_facecolor("lightgray") - plt.title('Ranking Space') plt.xlabel(p1) plt.ylabel(p2) @@ -396,6 +397,10 @@ def print_model_RS(data, models, p1, p2, df1, pathname = None, cm=None): plt.scatter(rxd, ryd, c='red', marker='.', zorder=2) elif d[2] == 0: plt.scatter(rxd, ryd, c='blue', marker='.', zorder=2) + if cm is not None: + pair = '/'.join([p1, p2, str(key)]) + error = cm.at['LOOCV', pair] + plt.title('Ranking Space \n LOOCV Error {}'.format(round(error,3))) plt.show() @@ -496,3 +501,13 @@ def show_results(df, probs_df, pairs, nbcpus, pathname, cm): for r in res: p1, p2, models, data = r print_model(data, models, p1, p2, probs_df, pathname, cm) + +def show_results_RS(df, probs_df, pairs, nbcpus, pathname, cm): + pool = mp.Pool(nbcpus) + vals = [(p, df) for p in pairs] + + res = pool.starmap(cr_models, vals, max(1,len(vals)//nbcpus) ) + + for r in res: + p1, p2, models, data = r + print_model_RS(data, models, p1, p2, probs_df, pathname, cm) diff --git a/Module/stages.py b/Module/stages.py index 24ab16b211d792e7beffe4156d7682c48f5d2e37..1fc83501fed88676e416dd82d8c3ec10a9141520 100755 --- a/Module/stages.py +++ b/Module/stages.py @@ -5,6 +5,8 @@ from Module import tools from Module import selection_algorithm as sa from Module import preselection as psl +from Module 
import dynamic_preselection as dpsl
+
 import pandas as pd
 import matplotlib.pyplot as plt
 import os
@@ -13,13 +15,19 @@ import os
 from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef
-def stage0(df, nbcpus, threshold):
+def stage0_old(df, nbcpus, threshold):
     reg = psl.regression_error_matrix(df, nbcpus)
     if threshold is not None:
         reg = psl.preselection_reg_err(reg, threshold)
     return reg
+def stage0(df, nbcpus, m, log):
+    config = dpsl.all_configurations(df)
+    Q, pairs = dpsl.algorithm_2(config, df, m, nbcpus, log)
+    return pairs
+
+
 def stage_1(df, cls, min_k, max_k, nbcpus, strat, funct, log):
@@ -74,9 +82,9 @@ def stage_2(df, cls, k_opt, auc_file, conf_mat_file, nbcpus, funct, strat, logs)
     labels, probas, uncertain_pts = tools.unclassified_points(y_true, y_proba)
     return acc, auc, CI, conf_mat
-def stage_3(df, cls, k_opt, nbcpus, funct, strat):
+def stage_3(df, cls, k_opt, cost_mat, nbcpus, funct, strat):
     ndf_err = cmu.error_matrix(df, cls, nbcpus,funct)
-    ndf_err.to_csv('cm_st3.csv')
+    ndf_err.to_csv(cost_mat)
     cost = cmu.cost_classifiers(ndf_err)
     mve, pairs, algo = oka.find_k_metamodel(df, ndf_err, cost, k_opt, nbcpus, strat)
     return pairs, mve
diff --git a/README.md b/README.md
index a9e194b83218e10b7f6a2dc22f3978da4e6845fb..9bc3c913ab9cf1da7e5ce7feef2b113b080fc917 100644
--- a/README.md
+++ b/README.md
@@ -1,92 +1,40 @@
-# MEM_Python
+# Monotonic Ensemble Models
+Author: Océane FOURQUET
+This project aims to construct an ensemble model of bidimensional monotonic classifiers [1](https://link.springer.com/article/10.1007/s00453-012-9628-4). In addition to reimplementing an established approach [2](https://academic.oup.com/jid/article/217/11/1690/4911472?login=true) in Python, it integrates a preselection of the feature pairs, which drastically reduces the running time of the approach.
-## Getting started
+## Python and library versions
+- Python 3.9.1
+- Pandas 1.2.3
+- Numpy 1.19.2
+- Matplotlib 3.3.4
-To make it easy for you to get started with GitLab, here's a list of recommended next steps.
+## Run the code
-Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
+### Data
+Data should be provided in CSV format, with one sample per row and one feature per column, plus a 'target' column containing the class labels.
-## Add your files
+### Code
+The code is split into three stages. Stage 1: determine the optimal number of classifiers for the metamodel. Stage 2: compute an estimate of the AUC score of a metamodel built with k_opt classifiers. Stage 3: build the final metamodel from the whole dataset. The three stages are implemented in stages.py, in the Module directory.
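As a complement to the stage description above, here is a minimal, hedged sketch of a stage-0 call using the `stage0(df, nbcpus, m, log)` signature introduced in this patch; the CSV path is the one from the README example, while the CPU count, the number of pairs `m`, and the log file name are illustrative assumptions rather than project defaults (the actual wiring lives in full\_project.py):

```python
# Illustrative sketch only -- parameter values and the log file name are assumptions.
import pandas as pd
from Module import stages

if __name__ == '__main__':
    # One sample per row, one feature per column, plus a 'target' class column.
    df = pd.read_csv('dengue/dengue.csv')

    # Stage 0 (new in this patch): dynamic preselection of candidate feature pairs.
    pairs = stages.stage0(df, nbcpus=4, m=50, log='preselection_log.txt')
    print(len(pairs), 'pairs retained for the later stages')
```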
-- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
-- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
+To run the whole project, run full\_project.py as follows:
-```
-cd existing_repo
-git remote add origin https://gitlab.pasteur.fr/ofourque/mem_python.git
-git branch -M main
-git push -uf origin main
-```
+python3 full\_project.py <dataset> <probeset> <outpath>
-## Integrate with your tools
+Example: python3 full\_project.py dengue/dengue.csv dengue/probeset_annotations.txt Result/ > log.txt
-- [ ] [Set up project integrations](https://gitlab.pasteur.fr/ofourque/mem_python/-/settings/integrations)
+To run a single stage, use the corresponding script among stage1.py, stage2.py and stage3.py.
-## Collaborate with your team
+Note that by default, the code uses the 3-class classification (severe, non-severe and uncertain). To compute the metamodel while favoring the severe or the non-severe class (2-class classification), change the corresponding parameter in full\_project.py.
-- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
-- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
-- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
-- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
-- [ ] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
-## Test and Deploy
-
-Use the built-in continuous integration in GitLab.
-
-- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
-- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
-- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
-- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
-- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
-
-***
-
-# Editing this README
-
-When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template.
-
-## Suggestions for a good README
-Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
-
-## Name
-Choose a self-explaining name for your project.
-
-## Description
-Let people know what your project can do specifically.
Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
-
-## Badges
-On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
-
-## Visuals
-Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
-
-## Installation
-Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
-
-## Usage
-Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
-
-## Support
-Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
-
-## Roadmap
-If you have ideas for releases in the future, it is a good idea to list them in the README.
-
-## Contributing
-State if you are open to contributions and what your requirements are for accepting them.
-
-For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
-
-You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
-
-## Authors and acknowledgment
-Show your appreciation to those who have contributed to the project.
-
-## License
-For open source projects, say how it is licensed.
-
-## Project status
-If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
+## Files in Module
+- cost\_matrix\_uncertainty.py
+- dynamic\_preselection.py
+- measures.py
+- monotonic\_regression\_uncertainty.py
+- optimal\_k\_aggregations.py
+- selection\_algorithm.py
+- show\_results\_4cases.py
+- stages.py
+- tools.py
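The new Module/dynamic_preselection.py listed above can also be driven directly. Classifier identifiers throughout the preselection code are strings of the form 'feature1/feature2/key', where key in 1..4 selects one of the four monotonic cases. Below is a minimal, hedged sketch of enumerating and scoring candidates by regression error with `all_configurations` and `H_df`, as introduced in this patch; the two-feature toy data frame and the CPU count are invented for illustration only:

```python
# Illustrative toy example -- the data frame values and nbcpus are assumptions.
import pandas as pd
from Module import dynamic_preselection as dpsl

if __name__ == '__main__':
    df = pd.DataFrame({
        'geneA':  [0.1, 0.4, 0.35, 0.8, 0.9],
        'geneB':  [0.7, 0.5, 0.45, 0.2, 0.1],
        'target': [0, 0, 1, 1, 1],
    })

    # Every pair of non-target features combined with the four monotonic cases (keys 1-4),
    # e.g. 'geneA/geneB/1', ..., 'geneA/geneB/4'.
    configs = dpsl.all_configurations(df)

    # Regression error of each candidate classifier, returned as (RE, pair) tuples,
    # sorted ascending so the best-fitting candidates come first.
    scored = dpsl.H_df(df, configs, nbcpus=2)
    best_re, best_pair = scored[0]
    print(best_pair, best_re)
```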