diff --git a/Module/cost_matrix_uncertainty.py b/Module/cost_matrix_uncertainty.py index a04024b3505453a490c2b5e0661e849e6d6c59ee..ae057496e6fe8ab66fcf2a2d932a95c2404a2d58 100755 --- a/Module/cost_matrix_uncertainty.py +++ b/Module/cost_matrix_uncertainty.py @@ -10,7 +10,7 @@ from itertools import chain import copy -import multiprocess as mp +import multiprocessing as mp ### Useful functions for parallele diff --git a/Module/dynamic_preselection.py b/Module/dynamic_preselection.py index 6c71fd6ab5a1cb616566f87f5f80b737bdd32919..9fdeeca4f533bd8e525b7dd0edcb77bd687eae95 100644 --- a/Module/dynamic_preselection.py +++ b/Module/dynamic_preselection.py @@ -28,9 +28,7 @@ def H_df(df, cls, nbcpus): pool = mp.Pool(nbcpus) vals = [(c, df) for c in cls] res = pool.starmap(monotonic_model_RE, vals, max(1,len(vals)//nbcpus)) - #f = open('logs_H_df.txt', 'a') - #f.write('H d_f {} \n'.format(res)) - #f.close() + return sorted(res) @@ -108,11 +106,11 @@ def deleteNode(array, num): if num == array[i][0]: break - #print('array 1', array) + array[i], array[size - 1] = array[size - 1], array[i] - #print('array 2', array) + 
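The first code change swaps the third-party `multiprocess` package (a dill-based fork) for the standard-library `multiprocessing`. The two expose the same `Pool` API, so the one-word import change is enough, provided the mapped functions and their arguments are picklable. A minimal sketch of the `H_df` pool pattern above, with a hypothetical `score` function standing in for `monotonic_model_RE`:

```python
import multiprocessing as mp

def score(cl, df):
    # Hypothetical stand-in for monotonic_model_RE: returns (classifier, error).
    return (cl, len(cl) % 3)

if __name__ == "__main__":
    cls = ["g1/g2/1", "g1/g3/2", "g2/g3/4"]
    vals = [(c, None) for c in cls]
    nbcpus = 2
    with mp.Pool(nbcpus) as pool:
        # Third positional argument is the chunksize; max(1, ...) avoids 0.
        res = pool.starmap(score, vals, max(1, len(vals) // nbcpus))
    print(sorted(res))
```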
array.remove(array[size - 1]) - #print('array 3', array) + for i in range((len(array) // 2) - 1, -1, -1): heapify(array, len(array), i) @@ -263,19 +261,15 @@ def check_disjoint_pairs(Q, param): if len(dis_p) >= param: flag = True break - print('Pairs with {} genes'.format(len(genes))) return flag ### Preselection based on the number of genes -def algorithm_1(cls, df, k, nbcpus, logs): +def algorithm_1(cls, df, k, nbcpus): t0 = time.time() H = H_df(df, cls, nbcpus) t1 = time.time() - f = open(logs, 'a') - f.write('H computed in {} en len(H) = {}\n\n'.format(t1 - t0, len(H))) - f.close() Hd = H_dict(H) @@ -305,10 +299,6 @@ def algorithm_1(cls, df, k, nbcpus, logs): break - f = open(logs, 'a') - f.write('Old G {}\n\n'.format(G.keys())) - f.close() - a = max(Q.keys()) G_ = deepcopy(G) del G_[a] @@ -319,9 +309,6 @@ def algorithm_1(cls, df, k, nbcpus, logs): G_ = deepcopy(G) del G_[a] - f = open(logs, 'a') - f.write('New G {}\n\n'.format(G.keys())) - f.close() @@ -330,22 +317,9 @@ def algorithm_1(cls, df, k, nbcpus, logs): Hd = supp_H_below_a(Hd, h_key) - t3 = time.time() - f = open(logs, 'a') - f.write('S1 computed in {} and size pairs={}\n\n'.format(t3 - t2, Hd.keys())) - f.close() - f = open(logs, 'a') - f.write('G {}\n\n'.format(G.keys())) - f.close() - - - t4 = time.time() for h_key in sorted(Hd.keys()): - f = open(logs, 'a') - f.write('H_key {}\n\n'.format(h_key)) - f.close() a = max(Q.keys()) if h_key <= a: @@ -364,9 +338,6 @@ def algorithm_1(cls, df, k, nbcpus, logs): G = update_dict(G, Gd) Q = update_dict(Q, Qd) - f = open(logs, 'a') - f.write('Old G {}\n\n'.format(G.keys())) - f.close() G_ = deepcopy(G) del G_[a] @@ -377,19 +348,11 @@ def algorithm_1(cls, df, k, nbcpus, logs): G_ = deepcopy(G) del G_[a] - f = open(logs, 'a') - f.write('New G {}\n\n'.format(G.keys())) - f.close() pairs = list() for key in Q.keys(): pairs += Q[key] - t5 = time.time() - f = open(logs, 'a') - f.write('S2 computed in {} and size pairs={}\n\n'.format(t5 - t4, len(pairs))) - f.close() - return Q, pairs @@ -397,15 +360,12 @@ def algorithm_1(cls, df, k, nbcpus, logs): ## Preselection based on the number of pairs -def algorithm_2(cls, df, m, nbcpus, logs): +def algorithm_2(cls, df, m, nbcpus): t0 = time.process_time() H = H_df(df, cls, nbcpus) t1 = time.process_time() - f = open(logs, 'a') - f.write('H computed in {} en len(H) = {}\n\n'.format(t1 - t0, len(H))) - f.close() Hd = H_dict(H) @@ -438,9 +398,7 @@ def algorithm_2(cls, df, m, nbcpus, logs): t3 = time.process_time() - f = open(logs, 'a') - f.write('S1 computed in {}, H size pairs={}, Q size pairs = {}\n\n'.format(t3 - t2, Hd.keys(), Q.keys())) - f.close() + t4 = time.process_time() @@ -472,9 +430,7 @@ def algorithm_2(cls, df, m, nbcpus, logs): pairs += Q[key] t5 = time.process_time() - f = open(logs, 'a') - f.write('S2 computed in {} and size pairs={}\n\n'.format(t5 - t4, len(pairs))) - f.close() + return Q, pairs diff --git a/Module/monotonic_regression_uncertainty.py b/Module/monotonic_regression_uncertainty.py index 48cb10f1ea20d7b878b99abeb19c685da8f44159..88b43b6b7d5d27150dbdae2e04e6d36f7440b9c0 100755 --- a/Module/monotonic_regression_uncertainty.py +++ b/Module/monotonic_regression_uncertainty.py @@ -56,7 +56,6 @@ def Z_(H, A, ind_leaves): for i in range(1,len(H)+1): v_i = ind_leaves[i] Z.append(int(compute_Z(A,v_i))) - print('Z', Z) return Z def is_A_balanced(A): @@ -176,7 +175,7 @@ def step1(ind_cg, A, nb_leaves, ind_leaves, err1, S, H): else: A[ind_leaves[ind_cg]-1] = mini + err1 degenerate_interval(A, ind_leaves[ind_cg], ind_leaves) - 
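The `deleteNode` hunk above strips debug prints from a delete-arbitrary-element heap routine: swap the target with the last slot, drop the last slot, then rebuild the heap bottom-up. A compact sketch of the same strategy, assuming a max-heap of `(key, payload)` tuples for concreteness; `arr.pop()` avoids the value-based `remove` scan, and note the full rebuild is O(n) where sifting only the swapped slot would be O(log n):

```python
def heapify(arr, n, i):
    # Standard max-heapify on (key, payload) tuples, comparing by key.
    largest, l, r = i, 2 * i + 1, 2 * i + 2
    if l < n and arr[l][0] > arr[largest][0]:
        largest = l
    if r < n and arr[r][0] > arr[largest][0]:
        largest = r
    if largest != i:
        arr[i], arr[largest] = arr[largest], arr[i]
        heapify(arr, n, largest)

def delete_node(arr, key):
    # Assumes `key` is present, as deleteNode does.
    i = next(j for j, item in enumerate(arr) if item[0] == key)
    arr[i] = arr[-1]   # overwrite the target with the last element
    arr.pop()          # drop the now-duplicated last slot
    for j in range(len(arr) // 2 - 1, -1, -1):
        heapify(arr, len(arr), j)

heap = [(9, 'a'), (7, 'b'), (8, 'c'), (1, 'd')]
delete_node(heap, 7)
print(heap)  # a valid max-heap without key 7: [(9, 'a'), (1, 'd'), (8, 'c')]
```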
#print('Step1', time.process_time() - t0) + @@ -241,7 +240,7 @@ def step2(A, v, err0, err1): #add the error in left and right intervals update_all_right(v, A, err0) update_all_left(v, A, err1) - #print('Step2', time.process_time() - t0) + @@ -615,13 +614,13 @@ def compute_recursion(data, case = None): models[4] = (reg_err, bpr, bpb, r_p, b_p) else: - #print('case {}'.format(case)) + rev, up = case[0], case[1] X, H, A, ind_leaves = initialization(data, rev) S = list() labs = [x[2] for x in X] nb_leaves = len(H) - #print(X) + for i in range(len(X)): #((r,c), w, label) x = X[i] xy, w, lab = x diff --git a/Module/optimal_k_aggregations.py b/Module/optimal_k_aggregations.py index a0ca5821f75e1120dfc952d56b8242bce005deea..20b72185f8aefb2b5a95aa1260bc63aa24962e03 100755 --- a/Module/optimal_k_aggregations.py +++ b/Module/optimal_k_aggregations.py @@ -63,19 +63,17 @@ def create_and_predict_metamodel(df_, out, pairs, nbcpus, funct): ##Big loop to compute the average error for ensemble contaning k classifiers -def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): - print('k misclassification : {}\n'.format(funct)) +def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k): - k_mis = {k : list() for k in range(min_k, max_k)} #Store for each value of k, whereas patients were misclassified or not with an ensemble of k classifiers + + k_mis = {k : list() for k in range(min_k, max_k+1)} #Store for each value of k, whereas patients were misclassified or not with an ensemble of k classifiers #pairs_err = {cl : list() for cl in cls} #For each classifiers we are going to store the average misclassification error (computed with LOOCV) made with each patients for j in range(len(df)): t1 = time.time() - f = open(log, 'a') - f.write('Patient {} \n'.format(j)) - f.close() + out = df.iloc[j, :] df_2 = df.drop([j]) df_2.reset_index(drop=True, inplace=True) @@ -83,11 +81,6 @@ def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): ndf_err = cmu.error_matrix(df_2, cls, nbcpus,funct) - f = open(log, 'a') - f.write('cost matrix computed \n') - f.close() - - cost = cmu.cost_classifiers(ndf_err) @@ -107,7 +100,7 @@ def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): min_k +=1 #k > 1: ensemble classifiers - for k in range(min_k, max_k): + for k in range(min_k, max_k+1): t0 = time.time() mve, pairs, algo = find_k_metamodel(df_2, ndf_err, cost, k, nbcpus, strat) pred, proba = create_and_predict_metamodel(df_2, out, pairs, nbcpus, funct) @@ -117,16 +110,7 @@ def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): else: #case of proba = -1 means that all the classifiers in the metamodel predicted the point as uncertain print('case of unknown point in oka') - f = open(log, 'a') - f.write('Creation metamodel for k = {} in {}s \n'.format(k, time.time()-t0)) - f.close() - - f = open(log, 'a') - f.write('End patient in {}s \n'.format(time.time() - t1)) - f.close() - - - k_error = {k : np.mean(k_mis[k]) for k in range(min_k, max_k)} + k_error = {k : np.mean(k_mis[k]) for k in range(min_k, max_k+1)} return k_error diff --git a/Module/preselection.py b/Module/preselection.py deleted file mode 100644 index caffa4d1d8179d42d7cbc36e1f88ad84231aa130..0000000000000000000000000000000000000000 --- a/Module/preselection.py +++ /dev/null @@ -1,59 +0,0 @@ -from Module import cost_matrix_uncertainty as cmu -from Module import optimal_k_aggregations as oka -from Module import monotonic_regression_uncertainty as mru -from Module import tools -from 
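The `optimal_k_aggregations.py` hunks above do two things: strip the per-patient log writes, and fix an off-by-one in which `range(min_k, max_k)` excluded `k = max_k` from both the bookkeeping dict and the per-k evaluation loop. A two-line illustration of the now-inclusive range:

```python
# Before the fix, the ensemble of max_k classifiers was silently never evaluated.
min_k, max_k = 2, 5
k_mis = {k: [] for k in range(min_k, max_k + 1)}
assert list(k_mis) == [2, 3, 4, 5]  # max_k included
```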
Module import selection_algorithm as sa - -import multiprocessing as mp - -import pandas as pd -import matplotlib.pyplot as plt -import os - - -def all_configurations(df): - transcripts = list(df.columns) - transcripts.remove('target') - - configurations = list() - for i in range(len(transcripts)): - for j in range(i+1, len(transcripts)): - for key in range(1,5): - configurations.append('/'.join([transcripts[i], transcripts[j], str(key)])) - return configurations - - -def single_score(cl, df): - p1, p2, key = cl.split('/') - key = int(key) - rev, up = tools.equiv_key_case(key) - - diag = df['target'].to_list() - tr1, tr2 = df[p1].to_list(), df[p2].to_list() - data = [((tr1[i], tr2[i]), 1, diag[i]) for i in range(len(diag))] - - - X, m = mru.compute_recursion(data, (rev, up, key)) - reg_err, bpr, bpb, r_p, b_p = m[key] - - return (cl, reg_err) - -def regression_error_score(df, cls, nbcpus): - pool = mp.Pool(nbcpus) - - vals = [(cl, df) for cl in cls] - res = pool.starmap(single_score, vals, max(1, len(vals)//nbcpus)) - - dico = {r[0] : r[1] for r in res} - s = pd.Series(dico) - s.name = 'regression error' - return s - -def regression_error_matrix(df, nbcpus): - config = all_configurations(df) - reg = regression_error_score(df, config, nbcpus) - return reg - - -def preselection_reg_err(reg_err_mat, threshold): - return reg_err_mat[reg_err_mat<threshold] diff --git a/Module/show_results_4cases.py b/Module/show_results.py similarity index 58% rename from Module/show_results_4cases.py rename to Module/show_results.py index 25d9795838200b05e7b9b26be29576b92ccc6527..335ba85b82fd6a530718f5e8ca84ef2359930bcd 100755 --- a/Module/show_results_4cases.py +++ b/Module/show_results.py @@ -131,10 +131,9 @@ def print_model(data, models, p1, p2, df1, pathname = None, cm=None): -def print_model_log(data, models, p1, p2, df1, pathname = None): - #print the monotonic space with the 3 areas - data = [((log(d[0][0]), log([0][1])), d[1], d[2]) for d in data] +def print_model_out(data, out, models, p1, p2, df1, pathname = None): + #print the monotonic space with the 3 areas if '.1.1' in p1: p1 = p1[:-2] @@ -152,8 +151,8 @@ def print_model_log(data, models, p1, p2, df1, pathname = None): for key in models.keys(): key = int(key) - #print('Key', key) - plt.figure(figsize=(3,3)) + print('Key', key) + plt.figure(figsize=(5,5)) ax = plt.axes() ax.set_facecolor("lightgray") @@ -199,8 +198,8 @@ def print_model_log(data, models, p1, p2, df1, pathname = None): ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) - #plt.scatter(x_r, y_r, c = 'blue', marker='.', zorder = 2) - #plt.scatter(x_b, y_b, c = 'red', marker='.', zorder = 2) + #plt.scatter(x_r, y_r, c = 'blue', zorder = 2) + #plt.scatter(x_b, y_b, c = 'red', zorder = 2) random.shuffle(data) @@ -209,6 +208,7 @@ def print_model_log(data, models, p1, p2, df1, pathname = None): plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) elif d[2] == 1: plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.scatter(out[0], out[1], c = 'g', zorder = 2) plt.xlabel(g1) plt.ylabel(g2) @@ -224,29 +224,18 @@ def print_model_log(data, models, p1, p2, df1, pathname = None): -def print_model_out(data, out, models, p1, p2, df1, pathname = None): - #print the monotonic space with the 3 areas - if '.1.1' in p1: - p1 = p1[:-2] - if '.1.1' in p2: - p2 = p2[:-2] - try: - g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] - except: - g1 = p1 - try: - g2 = df1[df1['Probeset_ID'] == 
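The hunk above deletes `preselection.py`, whose regression-error preselection is superseded by `dynamic_preselection.algorithm_2` (wired in via `stages.stage0` further down). Its configuration encoding survives: every unordered feature pair crossed with the four monotonicity cases, serialised as `'p1/p2/key'`. An equivalent, slightly more idiomatic sketch, assuming a `'target'` label column as the deleted code did:

```python
from itertools import combinations

def all_configurations(columns):
    # Every unordered feature pair x four monotonicity cases, as 'p1/p2/key'.
    feats = [c for c in columns if c != 'target']
    return ['/'.join([a, b, str(key)])
            for a, b in combinations(feats, 2)
            for key in range(1, 5)]

print(len(all_configurations(['g1', 'g2', 'g3', 'target'])))  # 3 pairs * 4 cases = 12
```

Incidentally, the removed `print_model_log` carried a latent bug — `log([0][1])` instead of `log(d[0][1])`, which would raise an `IndexError` on first use — so folding it away loses nothing.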
p2]['Gene.Symbol'].values.tolist()[0] - except: - g2 = p2 + +def print_model_positive(data, models, p1, p2, pathname, cm): + for key in models.keys(): key = int(key) - print('Key', key) - plt.figure(figsize=(5,5)) + #print('Key', key) + plt.figure(figsize=(3,3)) ax = plt.axes() - ax.set_facecolor("lightgray") + ax.set_facecolor("lightcoral") x_r = list() y_r = list() @@ -264,169 +253,57 @@ def print_model_out(data, out, models, p1, p2, df1, pathname = None): reg_err, bpr, bpb, r_p, b_p = models[key] - for bp in bpb: - x, y = bp - if key == 1: - ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightskyblue', zorder = 1)) - elif key == 2: - ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightskyblue', zorder = 1)) - elif key == 3: - ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) - else: - ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightskyblue', zorder = 1)) + min_x = min(min(x_r), min(x_b))-0.05 + min_y = min(min(y_r), min(y_b))-0.05 + max_x = max(max(x_r), max(x_b))+0.05 + max_y = max(max(y_r), max(y_b))+0.05 - for bp in bpr: + for bp in bpb: x, y = bp - - if key == 1: - ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor ='lightcoral', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightsteelblue', zorder = 1)) elif key == 2: - ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightcoral', zorder = 1)) + ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightsteelblue', zorder = 1)) elif key == 3: - ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightcoral', zorder = 1)) + ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor = 'lightsteelblue', zorder = 1)) else: - ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightsteelblue', zorder = 1)) + - #plt.scatter(x_r, y_r, c = 'blue', zorder = 2) - #plt.scatter(x_b, y_b, c = 'red', zorder = 2) random.shuffle(data) for d in data: if d[2] == 0: - plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + plt.scatter(d[0][0], d[0][1], c = 'royalblue', marker='.', zorder = 2) elif d[2] == 1: - plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) - plt.scatter(out[0], out[1], c = 'g', zorder = 2) - plt.xlabel(g1) - plt.ylabel(g2) - - if pathname is not None: - plt.savefig(pathname + g1 + '_' + g2 + '.png') - - f = open(pathname + 'gene.txt', 'a') - f.write('{} : {}\n'.format(g1, p1)) - f.write('{} : {}\n'.format(g2, p2)) - f.close() - else: - plt.show() - - - -def rank_h(x, D, rev): - h = [d[0] for d in D] - h = sorted(h, reverse=rev) - r = h.index(x) - - if rev: - return len(D) - r + 1 - else: - return r+1 - -def rank_v(x, D, up): - h = [d[1] for d in D] - h.sort() - r = h.index(x) - #if up: - return r+1 - #else: - # return len(D) - r + 1 - - - - - - - -def print_model_RS(data, models, p1, p2, df1, pathname = None, cm=None): - D_hat = [x[0] for x in data] - - - plt.figure(figsize=(3,3)) - ax = plt.axes() - ax.set_facecolor("lightgray") - plt.xlabel(p1) - plt.ylabel(p2) - - for key in models.keys(): - key = int(key) - - rev, up = tools.equiv_key_case(key) - - reg, bpr, bpb, rps, bps = models[key] - - - - for bp in bpb: - x, y = bp - rxd = rank_h(x, D_hat, rev) - ryd = rank_v(y, D_hat, up) - - if key == 1: - ax.add_artist(patches.Rectangle((0.0, 0.0), rxd, 
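In the new `print_model_positive` (and `print_model_negative` below), each breakpoint of the monotonic model anchors a shaded rectangle whose corner depends on the case key, drawn over a background colour encoding the favoured class; plot bounds are now padded from the data instead of hard-coded at 1000. A minimal sketch of the key = 1 branch, with a made-up breakpoint and bounds:

```python
import matplotlib.pyplot as plt
import matplotlib.patches as patches

fig, ax = plt.subplots(figsize=(3, 3))
ax.set_facecolor("lightcoral")               # unshaded area = "red" class
x, y = 0.4, 0.6                              # one breakpoint from bpb
min_x, min_y, max_x, max_y = -0.05, -0.05, 1.05, 1.05
# Case key = 1: the "blue" region is the rectangle below and left of the breakpoint.
ax.add_artist(patches.Rectangle((min_x, min_y), abs(x - min_x), abs(y - min_y),
                                facecolor='lightsteelblue', zorder=1))
ax.set_xlim(min_x, max_x); ax.set_ylim(min_y, max_y)
plt.show()
```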
ryd, facecolor = 'lightskyblue', zorder = 1)) - elif key == 2: - ax.add_artist(patches.Rectangle((rxd, 0), 1000, ryd, facecolor = 'lightskyblue', zorder = 1)) - elif key == 3: - ax.add_artist(patches.Rectangle((rxd, ryd), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) - else: - ax.add_artist(patches.Rectangle((0, ryd ), rxd, 1000, facecolor = 'lightskyblue', zorder = 1)) - - - for bp in bpr: - x, y = bp - rxd = rank_h(x, D_hat, rev) - ryd = rank_v(y, D_hat, up) - - if key == 1: - ax.add_artist(patches.Rectangle((rxd, ryd), 1000, 1000, facecolor ='lightcoral', zorder = 1)) - elif key == 2: - ax.add_artist(patches.Rectangle((0, ryd ), rxd, 1000, facecolor = 'lightcoral', zorder = 1)) - elif key == 3: - ax.add_artist(patches.Rectangle((0.0, 0.0), rxd, ryd, facecolor = 'lightcoral', zorder = 1)) - else: - ax.add_artist(patches.Rectangle((rxd, 0), 1000, ryd, facecolor = 'lightcoral', zorder = 1)) + plt.scatter(d[0][0], d[0][1], c = 'firebrick', marker='.', zorder = 2) + plt.xlabel(p1) + plt.ylabel(p2) - random.shuffle(data) - for d in data: - rxd = rank_h(d[0][0], D_hat, rev) - ryd = rank_v(d[0][1], D_hat, up) - if d[2] == 1: - plt.scatter(rxd, ryd, c='red', marker='.', zorder=2) - elif d[2] == 0: - plt.scatter(rxd, ryd, c='blue', marker='.', zorder=2) if cm is not None: pair = '/'.join([p1, p2, str(key)]) error = cm.at['LOOCV', pair] - plt.title('Ranking Space \n LOOCV Error {}'.format(round(error,3))) - - plt.show() - -def print_severe(data, models, p1, p2, df1, pathname = None): - #print the monotonic space with the 3 areas + plt.title('RE = {} & LOOCVE = {}'.format(round(reg_err/len(data),3),round(error,3))) + else: + plt.title('RE = {}'.format(round(reg_err/len(data),3))) - if '.1.1' in p1: - p1 = p1[:-2] - if '.1.1' in p2: - p2 = p2[:-2] + if pathname is not None: + name = pathname + '/' + str(p1) + '_' + str(p2) + '.pdf' + plt.savefig(name.replace('//', '/'), bbox_inches='tight') + else: + plt.show() - try: - g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] - except: - g1 = p1 - try: - g2 = df1[df1['Probeset_ID'] == p2]['Gene.Symbol'].values.tolist()[0] - except: - g2 = p2 +def print_model_negative(data, models, p1, p2, pathname, cm): for key in models.keys(): key = int(key) - print('Key', key) - plt.figure(figsize=(5,5)) + #print('Key', key) + plt.figure(figsize=(3,3)) ax = plt.axes() - ax.set_facecolor("lightcoral") + ax.set_facecolor("lightsteelblue") x_r = list() y_r = list() @@ -444,55 +321,63 @@ def print_severe(data, models, p1, p2, df1, pathname = None): reg_err, bpr, bpb, r_p, b_p = models[key] - for bp in bpb: + + min_x = min(min(x_r), min(x_b))-0.1 + min_y = min(min(y_r), min(y_b))-0.1 + max_x = max(max(x_r), max(x_b))+0.1 + max_y = max(max(y_r), max(y_b))+0.1 + + + for bp in bpr: x, y = bp + + if key == 1: - ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor ='lightcoral', zorder = 1)) elif key == 2: - ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightcoral', zorder = 1)) elif key == 3: - ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) + ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightcoral', zorder = 1)) else: - ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightskyblue', zorder = 1)) - - - #for 
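For the record, the ranking-space helpers being removed in this hunk mapped each raw coordinate to its 1-based rank along the corresponding axis before plotting. The transforms were equivalent to the sketch below (with the unused `up` branch of `rank_v` dropped, as in the source):

```python
def rank_h(x, D, rev):
    # 1-based rank of x among the first coordinates of D.
    h = sorted((d[0] for d in D), reverse=rev)
    r = h.index(x)
    return len(D) - r + 1 if rev else r + 1

def rank_v(y, D):
    # 1-based rank of y among the second coordinates of D.
    v = sorted(d[1] for d in D)
    return v.index(y) + 1

D = [(0.2, 5.0), (0.9, 1.0), (0.5, 3.0)]
print(rank_h(0.5, D, rev=False), rank_v(3.0, D))  # 2 2
```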
bp in bpr: - # x, y = bp - - - # if key == 1: - # ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor ='lightcoral', zorder = 1)) - # elif key == 2: - # ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightcoral', zorder = 1)) - # elif key == 3: - # ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightcoral', zorder = 1)) - # else: - # ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) + ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightcoral', zorder = 1)) random.shuffle(data) for d in data: if d[2] == 0: - plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + plt.scatter(d[0][0], d[0][1], c = 'royalblue', marker='.', zorder = 2) elif d[2] == 1: - plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) - plt.xlabel(g1) - plt.ylabel(g2) + plt.scatter(d[0][0], d[0][1], c = 'firebrick', marker='.', zorder = 2) + plt.xlabel(p1) + plt.ylabel(p2) - if pathname is not None: - plt.savefig(pathname + g1 + '_' + g2 + '.png') + if cm is not None: + pair = '/'.join([p1, p2, str(key)]) + error = cm.at['LOOCV', pair] + plt.title('RE = {} & LOOCVE = {}'.format(round(reg_err/len(data),3),round(error,3))) + else: + plt.title('RE = {}'.format(round(reg_err/len(data),3))) - f = open(pathname + 'gene.txt', 'a') - f.write('{} : {}\n'.format(g1, p1)) - f.write('{} : {}\n'.format(g2, p2)) - f.close() + if pathname is not None: + name = pathname + '/' + str(p1) + '_' + str(p2) + '.pdf' + plt.savefig(name.replace('//', '/'), bbox_inches='tight') else: plt.show() +def show_results_pos(df, pairs, nbcpus, pathname, cm): + pool = mp.Pool(nbcpus) + vals = [(p, df) for p in pairs] + + res = pool.starmap(cr_models, vals, max(1,len(vals)//nbcpus) ) + + for r in res: + p1, p2, models, data = r + print_model_positive(data, models, p1, p2, pathname, cm) + -def show_results(df, probs_df, pairs, nbcpus, pathname, cm): +def show_results_neg(df, pairs, nbcpus, pathname, cm): pool = mp.Pool(nbcpus) vals = [(p, df) for p in pairs] @@ -500,7 +385,7 @@ def show_results(df, probs_df, pairs, nbcpus, pathname, cm): for r in res: p1, p2, models, data = r - print_model(data, models, p1, p2, probs_df, pathname, cm) + print_model_negative(data, models, p1, p2, pathname, cm) def show_results_RS(df, probs_df, pairs, nbcpus, pathname, cm): pool = mp.Pool(nbcpus) diff --git a/Module/stages.py b/Module/stages.py index 1fc83501fed88676e416dd82d8c3ec10a9141520..71222eecb10e9825b78b7136d59ad164fbea633e 100755 --- a/Module/stages.py +++ b/Module/stages.py @@ -3,8 +3,6 @@ from Module import optimal_k_aggregations as oka from Module import monotonic_regression_uncertainty as mru from Module import tools from Module import selection_algorithm as sa -from Module import preselection as psl - from Module import dynamic_preselection as dpsl import pandas as pd @@ -12,56 +10,38 @@ import matplotlib.pyplot as plt import os -from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef - - -def stage0_old(df, nbcpus, threshold): - reg = psl.regression_error_matrix(df, nbcpus) - if threshold is not None: - reg = psl.preselection_reg_err(reg, threshold) - return reg -def stage0(df, nbcpus, m, log): +def stage0(df, nbcpus, m): config = dpsl.all_configurations(df) - Q, pairs = dpsl.algorithm_2(config, df, m, nbcpus, log) + Q, pairs = dpsl.algorithm_2(config, df, m, nbcpus) return pairs -def stage_1(df, cls, min_k, max_k, nbcpus, strat, funct, log): - k_error = oka.k_missclassification(df, cls, nbcpus, 
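The `stages.py` hunks beginning above de-log the pipeline and settle its public signatures; the new `run_mem.py` at the end of this diff wires them together. In outline (names as in `Module/stages.py`; `funct` and `strat` come from `monotonic_regression_uncertainty` and `selection_algorithm`, exactly as `run_mem.py` sets them):

```python
from Module import stages as st
from Module import monotonic_regression_uncertainty as mru
from Module import selection_algorithm as sa

def run_pipeline(df, nbcpus, m, outdir):
    strat, funct = [sa.NB], mru.predict_severe   # favour class 1, as run_mem.py does
    cls = st.stage0(df, nbcpus, m)                                    # preselect pairs
    k_opt, k_error = st.stage_1(df, cls, 2, m, nbcpus, strat, funct)  # pick k by LOOCV
    acc, auc, ci = st.stage_2(df, cls, k_opt, outdir, nbcpus, funct, strat)  # estimate AUC
    pairs, cm = st.stage_3(df, cls, k_opt, nbcpus, funct, strat)      # final model, all data
    return pairs, cm, auc, ci
```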
funct, strat, min_k, max_k, log) +def stage_1(df, cls, min_k, max_k, nbcpus, strat, funct): + k_error = oka.k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k) k_opt, err_k = oka.optimal_k(k_error) return k_opt, k_error -def stage_2(df, cls, k_opt, auc_file, conf_mat_file, nbcpus, funct, strat, logs): +def stage_2(df, cls, k_opt, auc_file, nbcpus, funct, strat): errors, y_true, y_pred, y_proba = list(), list(), list(), list() for i in range(len(df)): out = df.iloc[i, :] df_2 = df.drop([i]) df_2.reset_index(drop=True, inplace=True) - f = open(logs, 'a') - f.write('Patient {} \n'.format(i)) - f.close() ndf_err = cmu.error_matrix(df_2, cls, nbcpus,funct) cost = cmu.cost_classifiers(ndf_err) - f = open(logs, 'a') - f.write('Matrix computed \n') - f.close() mve, pairs, algo = oka.find_k_metamodel(df_2, ndf_err, cost, k_opt, nbcpus, strat) pred, proba = oka.create_and_predict_metamodel(df_2, out, pairs, nbcpus, funct) - f = open(logs, 'a') - f.write('Pred={}, Proba={} \n'.format(pred, proba)) - f.close() - errors.append(abs(out['target']-pred)) y_proba.append(proba) y_true.append(out['target']) @@ -70,21 +50,13 @@ def stage_2(df, cls, k_opt, auc_file, conf_mat_file, nbcpus, funct, strat, logs) acc = errors.count(0)/len(errors) auc = tools.auc_score(y_proba, y_true, auc_file) - auc2 = roc_auc_score(y_true, y_proba) - CI = tools.confidence_interval(auc, y_true) - conf_mat = confusion_matrix(y_true, y_pred) - mcc = matthews_corrcoef(y_true, y_pred) - - f = open(logs, 'a') - f.write('acc={}, auc={}, CI = {}, aucsk = {}, confusion_matrix = {}, mcc = {} \n'.format(acc, auc, CI, auc2, conf_mat, mcc)) - f.close() + ci = tools.confidence_interval(auc, y_true) labels, probas, uncertain_pts = tools.unclassified_points(y_true, y_proba) - return acc, auc, CI, conf_mat + return acc, auc, ci -def stage_3(df, cls, k_opt, cost_mat, nbcpus, funct, strat): +def stage_3(df, cls, k_opt, nbcpus, funct, strat): ndf_err = cmu.error_matrix(df, cls, nbcpus,funct) - ndf_err.to_csv(cost_mat) cost = cmu.cost_classifiers(ndf_err) mve, pairs, algo = oka.find_k_metamodel(df, ndf_err, cost, k_opt, nbcpus, strat) - return pairs, mve + return pairs, ndf_err diff --git a/Module/tools.py b/Module/tools.py index 273b80dc2abe117989f6c1f5a1732c84feb50ea8..02401a4c84ccea4402e39cae7d2cb07e2f16186d 100755 --- a/Module/tools.py +++ b/Module/tools.py @@ -122,15 +122,16 @@ def auc_score(probas, labels, auc_file=None): CI = confidence_interval(auc3, labels) if auc_file is not None: - plt.plot(tpr_array, fpr_array, linestyle='--', color='darkorange', lw = 2, label='ROC curve', clip_on=False) - plt.plot([0, 1], [0, 1], color='navy', linestyle='--') + plt.plot(tpr_array, fpr_array, color='darkorange', label='ROC curve') + plt.plot([0, 1], [0, 1], color='gray', linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('AUC={}, CI=[{};{}] '.format(round(auc3,3), CI[0], CI[1])) plt.legend(loc="lower right") - plt.savefig(auc_file) + name = auc_file + '/auc.pdf' + plt.savefig(name.replace('//', '/'), bbox_inches='tight') return auc3 diff --git a/README.md b/README.md index 9bc3c913ab9cf1da7e5ce7feef2b113b080fc917..0f312714d099f38b7bdc4fe1cab9b99b199adb3b 100644 --- a/README.md +++ b/README.md @@ -5,36 +5,22 @@ Author: Océane FOURQUET This project aims to construct an ensemble model of bidimensional monotonic classifiers[1](https://link.springer.com/article/10.1007/s00453-012-9628-4). 
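Two notes on the `tools.py` ROC hunk above. First, `auc_file` is now treated as an output directory, with the curve saved as `auc.pdf` inside it; `os.path.join` would sidestep the `replace('//', '/')` dance. Second, the source plots `plt.plot(tpr_array, fpr_array, ...)` under an x-label of 'False Positive Rate'; if those array names are literal, the axes are transposed and worth double-checking. A hedged sketch with the conventional orientation (`save_roc` and its toy inputs are illustrative, not the module's API):

```python
import os
import matplotlib.pyplot as plt

def save_roc(fpr, tpr, auc, outdir):
    # Conventional ROC orientation: FPR on x, TPR on y.
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim(0.0, 1.0); plt.ylim(0.0, 1.0)
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title('AUC={}'.format(round(auc, 3)))
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(outdir, 'auc.pdf'), bbox_inches='tight')

save_roc([0.0, 0.2, 1.0], [0.0, 0.8, 1.0], 0.8, '.')
```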
In addition to reimplementing an established approach[2](https://academic.oup.com/jid/article/217/11/1690/4911472?login=true) in python, it integrates a preselection of the pairs of features, reducing drastically the running time of the approach. ## Python and librairies versions -- Python 3.9.1 -- Pandas 1.2.3 -- Numpy 1.19.2 -- Matplotlib 3.3.4 +- python 3.9.1 +- pandas 1.2.3 +- numpy 1.19.2 +- matplotlib 3.3.4 +- multiprocessing (standard library) + +This code uses the standard-library multiprocessing module to parallelise the computations. By default the number of CPUs is set to 1; it can be changed with the --nbcpus option. ## Run the code ### Data -Data should be presented in csv format, in the form of samples per row and features per column, with a 'target' column for classes. +Data should be provided in CSV format, with samples as rows and features as columns, plus a label column for the classes. For the moment, the code only supports data with two classes. ### Code -The code is split in 3 stages. Stage 1: determine the optimal number of classifiers to construct the metamodel. Stage 2: compute an estimate of the AUC score of a metamodel constructed with k_opt classifiers. Stage 3: construct the final metamodel from the whole dataset. The three stages are coded in the py file stages.py, available in the Module. - -To run the whole project, you can run full\_project.py with as follow: - -python3 full\_project.py <dataset> <probeset> <outpath> - -ex : python3 full\_project.py dengue/dengue.csv dengue/probeset_annotations.txt Result/ > log.txt - -To run only one stage, you can use the relevant py file among stage1.py, stage2.py and stage3.py. - -Note that by default, this code uses the 3-classes classification (severe, non severe and uncertain). If you want to compute the metamodel by favoring severe or non severe (2-classes classification), you must change the parameter in the file full\_project.py. - - -## Files in Module -- cost\_matrix\_uncertainty.py -- measures.py -- monotonic\_regression\_uncertainty.py -- optimal\_k\_aggregations.py -- selection\_algorithm.py -- show\_results.py -- stages.py -- tools.py +To run the approach with the default parameters, run the following command in your shell: +```bash +python3 run_mem.py <dataset> +``` +Optional parameters include the number of CPUs to use for the computations (--nbcpus), the maximum number of pairs for the ensemble model (--m), and the name of the label column in the dataset file (--target); run `python3 run_mem.py --help` for the full list. 
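With the README now pointing at `run_mem.py`, a programmatic invocation equivalent to the shell command above (all flags exist in the `run_mem.py` parser below; the file names are placeholders):

```python
import subprocess

subprocess.run(
    ["python3", "run_mem.py", "data.csv",
     "--target", "label",    # name of the class column, if not 'target'
     "--nbcpus", "4",        # CPUs used by the multiprocessing pools
     "--m", "30",            # preselection size, also the upper bound on k
     "--outdir", "Result/"],
    check=True,
)
```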
diff --git a/run_mem.py b/run_mem.py new file mode 100644 index 0000000000000000000000000000000000000000..c23dbc47ad06c65356132ed634d348f8eb193eb5 --- /dev/null +++ b/run_mem.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +from Module import stages as st +from Module import tools +from Module import monotonic_regression_uncertainty as mru +from Module import selection_algorithm as sa +from Module import optimal_k_aggregations as oka +from Module import show_results as sr + + +import numpy as np +import pandas as pd +import sys +import argparse + +def parse_args(): + parser=argparse.ArgumentParser(description="Monotonic Ensemble Model approach") + parser.add_argument("dataset", help="dataset in csv format") + parser.add_argument("--target", help="name of the column of the classes", default='target') + parser.add_argument("--nbcpus", help="number of cpus to use to do calculation", type=int, default=1) + parser.add_argument("--m", help="number of minimum disjoint pairs for preselection", type=int, default=30) + parser.add_argument("--favoring", help="class to favor in the model between 0 (control case) and 1 (cohort study)", default = 1, type=int, choices=[0,1]) + parser.add_argument("--outdir", help="directory for out files", default='./') + #parser.add_argument("--selection", help="selection strategy to use for the ensemble") + args=parser.parse_args() + return args + + +def verify_nb_classes_dataset(df, target): + if target in df.columns: + tt = set(df[target]) + if len(tt) != 2: + print('Expected number of classes: 2. There are {} classes in the dataset.\n'.format(len(tt))) + return False + else: + return True + else: + print("Can't find the class labels column. Configure the name of the labels column with argument --target.\n") + + +def modify_label_classes_dataset(df, target): + if not set(df[target]) == {0,1}: + tt = list(set(df[target])) + df[target] = df[target].map({tt[0]:0,tt[1]:1}) + df.rename({target:'target'}, axis=1, inplace=True) + return df + + +def main(): + inputs=parse_args() + + strat = [sa.NB] + if inputs.favoring == 1: + funct= mru.predict_severe + else: + funct= mru.predict_non_severe + + try: + df = pd.read_csv(inputs.dataset,index_col=0, low_memory=False) + df.reset_index(drop=True, inplace=True) + except: + print("Can't open the file {}. Check the format.\n".format(inputs.dataset)) + + if not verify_nb_classes_dataset(df, inputs.target): + sys.exit() + + df = modify_label_classes_dataset(df, inputs.target) + + + cls = st.stage0(df, inputs.nbcpus, inputs.m) + + k_opt, k_error = st.stage_1(df, cls, 2, inputs.m, inputs.nbcpus, strat, funct) + + print('Between 2 and {}, the minimal optimal number of classifiers for the ensemble model is {}. \n'.format(inputs.m, k_opt)) + + acc, auc, ci = st.stage_2(df, cls, k_opt, inputs.outdir, inputs.nbcpus, funct, strat) + + print('The estimated performance of the monotonic ensemble model is AUC = {} +/- {}. \n'.format(auc, ci)) + + pairs, cm = st.stage_3(df, cls, k_opt, inputs.nbcpus, funct, strat) + + print('The ensemble model is made of the following pairs : ') + for pair in pairs: + print('- {} \n'.format(pair[:-2])) + + if inputs.favoring == 1: + sr.show_results_pos(df, pairs, inputs.nbcpus, inputs.outdir, cm) + else: + sr.show_results_neg(df, pairs, inputs.nbcpus, inputs.outdir, cm) + + + + + + + + +if __name__ == "__main__": + main()
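Two robustness notes on the new `run_mem.py`. If `pd.read_csv` raises, the bare `except` only prints a message and execution falls through to a `NameError` on `df`. Also, `tt = list(set(df[target]))` makes the 0/1 label mapping depend on set iteration order, which can flip between runs for string labels; `sorted(...)` pins it. A hedged sketch of both fixes (`load_dataset` and `binarize_labels` are illustrative names, not the script's):

```python
import sys
import pandas as pd

def load_dataset(path: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(path, index_col=0, low_memory=False)
    except (OSError, pd.errors.ParserError) as e:
        sys.exit("Can't open the file {}: {}".format(path, e))  # stop instead of falling through
    return df.reset_index(drop=True)

def binarize_labels(df: pd.DataFrame, target: str) -> pd.DataFrame:
    if set(df[target]) != {0, 1}:
        lo, hi = sorted(set(df[target]))   # deterministic 0/1 assignment (labels assumed comparable)
        df[target] = df[target].map({lo: 0, hi: 1})
    return df.rename({target: 'target'}, axis=1)
```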