diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..eddf048df7bdf3062d661508dc2c5d7649330bb9 Binary files /dev/null and b/.DS_Store differ diff --git a/Module/.DS_Store b/Module/.DS_Store new file mode 100755 index 0000000000000000000000000000000000000000..19b82036d32618a951840f33d2c93ee7eff892ee Binary files /dev/null and b/Module/.DS_Store differ diff --git a/Module/__pycache__/cost_matrix_uncertainty.cpython-38.pyc b/Module/__pycache__/cost_matrix_uncertainty.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..02ad757007d86da1fdf1845f98ec0504962353c1 Binary files /dev/null and b/Module/__pycache__/cost_matrix_uncertainty.cpython-38.pyc differ diff --git a/Module/__pycache__/cost_matrix_uncertainty.cpython-39.pyc b/Module/__pycache__/cost_matrix_uncertainty.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a217a1b2a57daeee20a7c2f30411d77c5736faac Binary files /dev/null and b/Module/__pycache__/cost_matrix_uncertainty.cpython-39.pyc differ diff --git a/Module/__pycache__/cost_matrix_uncertainty_v2.cpython-39.pyc b/Module/__pycache__/cost_matrix_uncertainty_v2.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..fee8989c7e76919dca326645a40d09079d55e498 Binary files /dev/null and b/Module/__pycache__/cost_matrix_uncertainty_v2.cpython-39.pyc differ diff --git a/Module/__pycache__/measures.cpython-38.pyc b/Module/__pycache__/measures.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..cc8175622ee2cee526a076e2f7047409622c7828 Binary files /dev/null and b/Module/__pycache__/measures.cpython-38.pyc differ diff --git a/Module/__pycache__/measures.cpython-39.pyc b/Module/__pycache__/measures.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa3d020173484e9780d4dc91fd10532564f41f8b Binary files /dev/null and b/Module/__pycache__/measures.cpython-39.pyc differ diff --git a/Module/__pycache__/monotonic_regression_4cases.cpython-39.pyc b/Module/__pycache__/monotonic_regression_4cases.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..bd6abcdda3556f92104a4c16465af1e8423ebdf4 Binary files /dev/null and b/Module/__pycache__/monotonic_regression_4cases.cpython-39.pyc differ diff --git a/Module/__pycache__/monotonic_regression_uncertainty.cpython-38.pyc b/Module/__pycache__/monotonic_regression_uncertainty.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..28e8f1dd48afe5f00fe10e507afc658d552be6dc Binary files /dev/null and b/Module/__pycache__/monotonic_regression_uncertainty.cpython-38.pyc differ diff --git a/Module/__pycache__/monotonic_regression_uncertainty.cpython-39.pyc b/Module/__pycache__/monotonic_regression_uncertainty.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c149e50c32e14944c7ad6e4e464d285e645b056 Binary files /dev/null and b/Module/__pycache__/monotonic_regression_uncertainty.cpython-39.pyc differ diff --git a/Module/__pycache__/monotonic_regression_uncertainty_bis.cpython-39.pyc b/Module/__pycache__/monotonic_regression_uncertainty_bis.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..04a4aefe1d879f49104637b4568a0f386646de2d Binary files /dev/null and b/Module/__pycache__/monotonic_regression_uncertainty_bis.cpython-39.pyc differ diff --git a/Module/__pycache__/monotonic_regression_uncertainty_v1.cpython-39.pyc 
b/Module/__pycache__/monotonic_regression_uncertainty_v1.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..13c345d2299c2e6441fae49da50f0d2987522f93 Binary files /dev/null and b/Module/__pycache__/monotonic_regression_uncertainty_v1.cpython-39.pyc differ diff --git a/Module/__pycache__/monotonic_regression_uncertainty_v2.cpython-39.pyc b/Module/__pycache__/monotonic_regression_uncertainty_v2.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..9939f22b40b2163c8755fa1788c7c3564f8c83d1 Binary files /dev/null and b/Module/__pycache__/monotonic_regression_uncertainty_v2.cpython-39.pyc differ diff --git a/Module/__pycache__/optimal_k_aggregations.cpython-38.pyc b/Module/__pycache__/optimal_k_aggregations.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..816929b315143cd31076699cc0e8535a07a2a73c Binary files /dev/null and b/Module/__pycache__/optimal_k_aggregations.cpython-38.pyc differ diff --git a/Module/__pycache__/optimal_k_aggregations.cpython-39.pyc b/Module/__pycache__/optimal_k_aggregations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e6a22c8bb46763329272e0b152ec47e7ee05a9b Binary files /dev/null and b/Module/__pycache__/optimal_k_aggregations.cpython-39.pyc differ diff --git a/Module/__pycache__/optimal_k_aggregations_v2.cpython-39.pyc b/Module/__pycache__/optimal_k_aggregations_v2.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..1b9b748de6ab78bbb6b29f9740d07830ad6d9580 Binary files /dev/null and b/Module/__pycache__/optimal_k_aggregations_v2.cpython-39.pyc differ diff --git a/Module/__pycache__/preselection.cpython-39.pyc b/Module/__pycache__/preselection.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..335c7d27405aec0aade12dd43f1cd38612ce4370 Binary files /dev/null and b/Module/__pycache__/preselection.cpython-39.pyc differ diff --git a/Module/__pycache__/selection_algorithm.cpython-38.pyc b/Module/__pycache__/selection_algorithm.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..9befda9e9bfaac2b829c9dab5d20e939f10dbd56 Binary files /dev/null and b/Module/__pycache__/selection_algorithm.cpython-38.pyc differ diff --git a/Module/__pycache__/selection_algorithm.cpython-39.pyc b/Module/__pycache__/selection_algorithm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79170c5987605a38fd82c88271acc26b149a7405 Binary files /dev/null and b/Module/__pycache__/selection_algorithm.cpython-39.pyc differ diff --git a/Module/__pycache__/selection_algorithm_v2.cpython-39.pyc b/Module/__pycache__/selection_algorithm_v2.cpython-39.pyc new file mode 100755 index 0000000000000000000000000000000000000000..ab8ab13c851d4de362981d471ef9a7eae3e789b6 Binary files /dev/null and b/Module/__pycache__/selection_algorithm_v2.cpython-39.pyc differ diff --git a/Module/__pycache__/show_results.cpython-38.pyc b/Module/__pycache__/show_results.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..c5b962eab0a56c10c738cb71b723ac51d1ed1c15 Binary files /dev/null and b/Module/__pycache__/show_results.cpython-38.pyc differ diff --git a/Module/__pycache__/show_results.cpython-39.pyc b/Module/__pycache__/show_results.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a77523ab471eb02977f182b7253a9182f0e5c99 Binary files /dev/null and b/Module/__pycache__/show_results.cpython-39.pyc differ diff 
--git a/Module/__pycache__/show_results_4cases.cpython-38.pyc b/Module/__pycache__/show_results_4cases.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..1ab8bd2cb097ee1e24327eb042b443465987f776 Binary files /dev/null and b/Module/__pycache__/show_results_4cases.cpython-38.pyc differ diff --git a/Module/__pycache__/show_results_4cases.cpython-39.pyc b/Module/__pycache__/show_results_4cases.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b60c307a3f8c7658a1ad25c34c2e454dc7179053 Binary files /dev/null and b/Module/__pycache__/show_results_4cases.cpython-39.pyc differ diff --git a/Module/__pycache__/stages.cpython-38.pyc b/Module/__pycache__/stages.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..b0ce31ee2ee72d1a36d64a24d9fc7c3d44abd849 Binary files /dev/null and b/Module/__pycache__/stages.cpython-38.pyc differ diff --git a/Module/__pycache__/stages.cpython-39.pyc b/Module/__pycache__/stages.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f1b1d8b631f5a1ddbd8bdaa280656de911f2e391 Binary files /dev/null and b/Module/__pycache__/stages.cpython-39.pyc differ diff --git a/Module/__pycache__/tools.cpython-38.pyc b/Module/__pycache__/tools.cpython-38.pyc new file mode 100755 index 0000000000000000000000000000000000000000..8f2638c1b65f0190d22036d3ad42dcc974ea8167 Binary files /dev/null and b/Module/__pycache__/tools.cpython-38.pyc differ diff --git a/Module/__pycache__/tools.cpython-39.pyc b/Module/__pycache__/tools.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..03b72c0092087876efdff4564d67f0237de44159 Binary files /dev/null and b/Module/__pycache__/tools.cpython-39.pyc differ diff --git a/Module/cost_matrix_uncertainty.py b/Module/cost_matrix_uncertainty.py new file mode 100755 index 0000000000000000000000000000000000000000..9ef409080eda4f19db6c9dc727fd0d9362204457 --- /dev/null +++ b/Module/cost_matrix_uncertainty.py @@ -0,0 +1,150 @@ +from Module import monotonic_regression_uncertainty as mru +from Module import tools + + +import numpy as np +import pandas as pd +import os +import time +from itertools import chain +import copy + + +import multiprocessing as mp + + +### Useful functions for parallele + +def vals_mp(pairs, df_2, out, funct): + vals = list() + for p in pairs: + vals.append((p, df_2, out, funct)) + return vals + + +#### ERROR MATRIX ###### + + +def single_error(p, df_2, out, funct): + + p1, p2, key = p.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + + diag = df_2['target'].values.tolist() + + tr1, tr2 = df_2[p1].values.tolist(), df_2[p2].values.tolist() + + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + out_p = (out[p1], out[p2]) + X, models = mru.compute_recursion(data, (rev, up, key)) + + reg_err, bpr, bpb, r_p, b_p = models[key] + pred = funct(out_p, bpr, bpb, rev, up) + + if pred == -1: + return (p, -1) #if uncertain, we keep it like this + else: + return (p, abs(1-int(pred == out['target']))) #int(True) = 1 so if the pred is equal to real label, error is equal to 0 + + +def error_matrix(df_, pairs, nbcpus, funct): + try: + nbcpus = int (os.getenv('OMP_NUM_THREADS') ) + except: + pass + pool = mp.Pool(nbcpus) + + + df = copy.deepcopy(df_) + + index = list() + + mat_err = pd.DataFrame(columns = pairs + ['target']) #Dataframe with possible classifiers as columns + + for j in range(len(df)):# For each patient j, we add a line to the dataframe contaning whereas the 
patient was misclassified or not (1 if misclassified, 0 otherwise) + out = df.iloc[j, :] + df_2 = df.drop([j])#classifiers are constructed according to the set of patients without patient j + df_2.reset_index(drop=True, inplace=True) + + vals = vals_mp(pairs, df_2, out, funct) + + res = pool.starmap(single_error, vals, max(1,len(vals)//nbcpus)) #res is an array (size = nb of pairs) that the name of the classifier and whereas the poatient was misclassified or not + + dico_err = {r[0] : r[1] for r in res} + dico_err['target'] = out['target'] + dico_err_s = pd.Series(dico_err) + dico_err_s.name = 'P'+str(j+1) + mat_err = pd.concat((mat_err, dico_err_s.to_frame().T), axis=0) + del df_2 + + + + + unc = {col: mat_err[col].to_list().count(-1) for col in pairs} + unc['target'] = np.nan + unc_s = pd.Series(unc) + unc_s.name = 'uncertain' + + mat_err_unc = pd.concat((mat_err,unc_s.to_frame().T), axis=0) + + + + cols = list(mat_err_unc.columns) + cols.remove('target') + rem = list() + for col in cols: + val = mat_err_unc.at['uncertain', col] + if val > len(df)/3: + rem.append(col) + mat_err.drop(rem, axis=1, inplace=True) + mat_err_unc.drop(rem, axis=1, inplace=True) + + err = {col: mat_err[col].to_list().count(1)/(mat_err[col].to_list().count(1) + mat_err[col].to_list().count(0)) for col in pairs if col not in rem} + err['target'] = np.nan + err_s = pd.Series(err) + err_s.name = 'error' + + mat_err_final = pd.concat((mat_err_unc,err_s.to_frame().T), axis=0) + + + mat_err_final.sort_values(axis = 1, by=['error', 'uncertain'], inplace=True) + + del df + return mat_err_final + + + + +#### GOING error matrix to prediction matrix +def error_to_prediction(matrix, df): + diags = df['target'].values.tolist() + + prediction_mat = {} + for cls in matrix.keys(): + if cls != 'target': + errors = matrix[cls] + pred = list() + for i in range(len(errors)): + if errors[i] == 0: + pred.append(diags[i]) + elif errors[i] == 1: + pred.append(int(abs(1-diags[i]))) + elif errors[i] == -1: + pred.append(-1) + prediction_mat[cls] = pred + + return prediction_mat + + + + + +### Get dict relating classifiers with error score + + +def cost_classifiers(ndf): + cols = list(ndf.columns) + cols.remove('target') + cost = {cols[i] : ndf[cols[i]].loc[['error']][0] for i in range(len(cols))} + return cost diff --git a/Module/measures.py b/Module/measures.py new file mode 100755 index 0000000000000000000000000000000000000000..1ea55334244673d9a10e397bea978fa31a0c346f --- /dev/null +++ b/Module/measures.py @@ -0,0 +1,131 @@ +import math + + +#cls = classifier +# boolean prediction : 0 if the prediction is correct, 1 if it's wrong +#pred_xi : array of the boolean prediction of xi over the M classifiers +#set (size MxN) is a list of the boolean prediction of the M classifiers over the N patients + +def m_xi(pred_xi): + #number of classifiers producing error for the input sample xi + #pred_xi is the boolean prediction of xi over the M classifiers + return sum(pred_xi) + +def error_rate_clj(pred_clj): + # error rate of jth classifier + #pred_clj is the boolean prediction of the N patients by classifier j + return sum(pred_clj)/len(pred_clj) + +def ensemble_mean_error_rate(set): + #average error rate over the M classifiers + #set (size MxN) is a list of the boolean prediction of the M classifiers over the N patients + M = len(set) + e = 0 + for i in range(M): + e += error_rate_clj(set[i]) + return e/M + + + +def yi_MV_1(pred_xi): + #pred_xi is the boolean prediction of xi over the M classifiers + #majority boolean prediction error in favor 
of the wrong pred 1 + M = len(pred_xi) + if m_xi(pred_xi) >= M/2: + #if the nb of misclassifications is greater than or equal to half the nb of classifiers + #then the prediction made with majority voting is wrong + return 1 + else: + return 0 + +def yi_MV_0(pred_xi): + #pred_xi is the boolean prediction of xi over the M classifiers + #majority boolean prediction error in favor of correct pred 0 + M = len(pred_xi) + if m_xi(pred_xi) > M/2: + #if the nb of misclassifications is strictly greater than half the nb of classifiers + #then the prediction made with majority voting is wrong + return 1 + else: + return 0 + +def MVE(set, meth = yi_MV_1): + #majority voting error rate + M = len(set) #Nb of classifiers + N = len(set[0]) #Nb of patients + mve = 0 + for i in range(N): #For each patient i + #construction of pred_xi + pred_xi = list() + for j in range(M):#For each classifier j + pred_xi.append(set[j][i]) #We add the misclassification error of patient i for the classifier j + + yi_mv = meth(pred_xi) #Whether patient i was misclassified or not according to the ensemble + + mve += yi_mv + return mve/N + + + +def D2_ij(pred1, pred2): + #disagreement measure for 2 pairs + #pred1 and pred2 (size N) are the output of classifiers for the N patients + N = len(pred1) + D2 = 0 + for i in range(N): + if pred1[i] != pred2[i]: + D2 += 1 + return D2/N + +def D2(set): + #average disagreement measure over all pairs of a set + #set (size MxN) is a list of the boolean prediction of the M classifiers over the N patients + M = len(set) + D2 = 0 + for i in range(M): + for j in range(i+1, M): + if i != j: + D2 += D2_ij(set[i], set[j]) + + return (2*D2)/(M*(M-1)) + + +def F2_ij(pred1, pred2): + #double fault measure + #pred1 and pred2 (size N) are the output of classifiers for the N patients + N = len(pred1) + N11 = 0 + for i in range(N): + if pred1[i] == 1 and pred2[i] == 1: + #double fault + N11 +=1 + return N11/N + +def F2(set): + #average double fault measure over all pairs of a set + #set (size MxN) is a list of the boolean prediction of the M classifiers over the N patients + M = len(set) + F2 = 0 + for i in range(M): + for j in range(i+1, M): + if i != j: + F2 += F2_ij(set[i], set[j]) + return (2*F2)/(M*(M-1)) + +def entropy(set): + #set (size MxN) is a list of the boolean prediction of the M classifiers over the N patients + #The entropy measure reaches its maximum (EN = 1) for the highest disagreement, which is the case of observing M/2 votes with identical value (0 or 1) and +#M - M/2 votes with the alternative value. The lowest entropy (EN = 0) is observed if all classifier outputs are +#identical. 
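For orientation, a minimal usage sketch of the diversity measures defined in this file; the 3x4 boolean error matrix below is invented illustration data (rows = classifiers, columns = patients, 1 = misclassified), not part of the patch.

from Module import measures as ms

# Hypothetical MxN boolean error matrix: 3 classifiers scored on 4 patients
# (0 = patient correctly classified, 1 = misclassified); values are made up.
error_set = [
    [0, 1, 0, 0],   # classifier 1
    [0, 1, 1, 0],   # classifier 2
    [1, 0, 0, 0],   # classifier 3
]

print(ms.ensemble_mean_error_rate(error_set))  # mean individual error rate
print(ms.MVE(error_set))                       # majority-voting error rate
print(ms.D2(error_set))                        # average pairwise disagreement
print(ms.F2(error_set))                        # average pairwise double-fault rate
print(ms.entropy(error_set))                   # entropy-based diversity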
+ M = len(set) + N = len(set[0]) + + EN = 0 + for i in range(N): + #construction of pred_xi + pred_xi = list() + for j in range(M): + pred_xi.append(set[j][i]) + + EN += min(m_xi(pred_xi), M-m_xi(pred_xi)) / (M-math.ceil(M/2)) + return EN/N diff --git a/Module/monotonic_regression_uncertainty.py b/Module/monotonic_regression_uncertainty.py new file mode 100755 index 0000000000000000000000000000000000000000..48cb10f1ea20d7b878b99abeb19c685da8f44159 --- /dev/null +++ b/Module/monotonic_regression_uncertainty.py @@ -0,0 +1,778 @@ +from Module import tools + +import numpy as np +from math import ceil, log, pow +import pandas as pd +import os +import multiprocessing as mp +import copy +import time + + + +#Useful functions + +def err(v, w, z): #error function defined in the paper + return w*abs(v-z) + +next_ = lambda x: int( pow(2, ceil(log(x, 2)))) + +def index_leaves(A, k): + #As we use a balanced binary tree, leaves can be at at most two levels. This function constructs + #a dictionary linking each leaf index to the index of the corresponding node in the tree + #A is an array contaning all the values of the nodes + #k is the number of leaves in the tree + p_lev = next_(k) - k + lev = k - p_lev + ind_leaves = {} + for i in range(1, lev+1): + ind_leaves[i] = (len(A)-lev) + i + for i in range(1, p_lev+1): + ind_leaves[lev+i ] = (len(A)-k) + i + return ind_leaves + +def is_leaf(ind_leaves, num): + #confirms or not whether the node under study is a leaf + if num in ind_leaves.values(): + return True + else: + return False + +def find_leaves(ind_leaves, num, L): + #return a list with all the leaves below the node num + #ind_leaves is the dictionary with leaf index as key and node index as values + #num is the the node we are studying + #L is the list we are updating with the leaves + + if not is_leaf(ind_leaves, num): + find_leaves(ind_leaves, 2*num, L) + find_leaves(ind_leaves, 2*num +1, L) + else: + L.append(num) + +def Z_(H, A, ind_leaves): + #Show Z : this one won't appear in the final programm. 
It's just to check the tree for the moment + Z = list() + for i in range(1,len(H)+1): + v_i = ind_leaves[i] + Z.append(int(compute_Z(A,v_i))) + print('Z', Z) + return Z + +def is_A_balanced(A): + flag = True + for i in range(1, len(A), 2): + if A[i] != 0 and A[i+1] != 0: + flag = False + return flag + +########Initialize A +def initialization(data, rev): + #rev enable us to test increasing and decreasing isotonicity + #X is an array with sorted data + #H is a soretd list of all column values + #A is the array initialised at zero + #ind_leaves is the dictionnary linking each leaf index to the index of the corresponding node in the tree + X = sorted(data, reverse = rev) + H = sorted(list(set([X[i][0][1] for i in range(len(X))]))) + A = np.zeros(2*len(H)+1) + H.append(float('inf')) + ind_leaves = index_leaves(A, len(H)) + return X, H, A, ind_leaves + + +#####STEP 1 : compute Z and Z(g,c_g) +def compute_Z(A,v): + #compute Z by going up the tree from leaf v to the root (Z(g,h) = sum of z_v on the path to the root) + Z = A[v-1] + while True: + if v == 1: + break + v = v//2 + Z += A[v-1] + return Z + +#These two functions help for updating the tree when Z(g,c_g) is computed (it can change a lot in the path to the root) +def degenerate_interval(A, v, ind_leaves): + p = v //2 + mod = v%2 + while True: + if p == 1: + break + if A[p-1] != 0: + add_value(A, p, v, ind_leaves) + A[p-1] = 0 + p = p//2 + mod = p%2 + + + +def add_value(A, p, v, ind_leaves): + L = list() + find_leaves(ind_leaves, p, L) + for l in L: + if l != v: + A[l-1] += A[p-1] + + +def rebalance(A,v, ind_leaves): + if v != 0: + p = v//2 + mod = v%2 + w = 2*p+(1-mod) + if A[v-1] != 0 and A[w-1] !=0: + delta = min(A[v-1], A[w-1]) + A[p-1] += delta + A[v-1] -= delta + A[w-1] -= delta + else: + mini = float('inf') + for i in ind_leaves.values(): + Z = compute_Z(A,i) + if Z < mini: + mini = Z + A[0] = mini + + + + +def step1(ind_cg, A, nb_leaves, ind_leaves, err1, S, H): + #t0 = time.process_time() + mini = float('inf') + h = 0 + c = 1 #not sure of this + for i in range(ind_cg, nb_leaves+1): + v_i = ind_leaves[i] + Z = compute_Z(A,v_i) + + if Z < mini: + mini = Z + ind = i + c = 1 + h = H[i-1] + elif Z == mini: + c+=1 + if c>1: + while compute_Z(A,ind_leaves[ind]) == mini and ind < nb_leaves: + ind+=1 + if ind == nb_leaves: + h = H[ind-1] #value of the leaves that give minimum Z(g,h) + else: + h = H[ind-2] + if len(S) ==0: + h=H[0] + S.append(h) + + + + deg = compute_Z(A,ind_leaves[ind_cg]) - A[ind_leaves[ind_cg]-1] #sum of z_v on the path from c_g to the root, but minus the leaf c_g + + #if the new value of Z(g,c_g) minus deg is greater or equal to 0, then the value in A for leaf c_g is equal to the difference + # however, in the other case, it means that we have to change the path to make it correspond + + + if deg <= mini + err1: + A[ind_leaves[ind_cg]-1] = mini + err1 - deg + else: + A[ind_leaves[ind_cg]-1] = mini + err1 + degenerate_interval(A, ind_leaves[ind_cg], ind_leaves) + #print('Step1', time.process_time() - t0) + + + + + + +#######STEP 2 : update right and left + +#Update right c_g +def v_has_right(v, A): + #indicates whether the node v has a right child branch + if 2*v+1 <= len(A): + return True + else: + return False + +def update_right(v, A, val): + #update right child branch by adding val + A[2*v] += val + +def update_all_right(v, A, err0): + p = v//2 + while True: + if v_has_right(p, A) and 2*p+1 !=v: + update_right(p, A, err0) + + if p == 1: #end when root has been updated + break + v = p + p = p//2 + + + +#Update left c_g 
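A small sketch of the balanced-tree bookkeeping used by step1/step2 above; the array values are made up and only illustrate how leaf ranks map to node indices and how Z(g, h) is accumulated along the path to the root.

from Module import monotonic_regression_uncertainty as mru
import numpy as np

# A tree with k = 5 leaves has 2*5 - 1 = 9 nodes, stored heap-style in A.
A = np.zeros(9)
ind_leaves = mru.index_leaves(A, 5)   # leaf rank (1..5) -> 1-based node index
print(ind_leaves)                     # {1: 8, 2: 9, 3: 5, 4: 6, 5: 7}

# Z for a leaf is the sum of the z_v values stored on the path from that leaf to the root.
A[0] = 1.0                      # value stored at the root (node 1)
A[ind_leaves[3] - 1] = 0.5      # value stored at leaf 3 (A is 0-indexed, nodes are 1-indexed)
print(mru.compute_Z(A, ind_leaves[3]))   # 1.5 = 0.5 (leaf) + 1.0 (root)
print(mru.compute_Z(A, ind_leaves[1]))   # 1.0 (root contribution only)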
+def v_has_left(v, A): + #indicates whether the node v has a left child branch + if 2*v <= len(A): + return True + else: + return False + +def update_left(v, A, val): + #update left child branch by adding val + A[2*v-1] += val + + + +def update_all_left(v, A, err1): + p = v//2 + while True: + if v_has_left(p, A) and 2*p != v: + update_left(p, A, err1) + + if p == 1: #end when root has been updated + break + v = p + p = p//2 + + +def step2(A, v, err0, err1): + #t0 = time.process_time() + #add the error in left and right intervals + update_all_right(v, A, err0) + update_all_left(v, A, err1) + #print('Step2', time.process_time() - t0) + + + + + +########RECURSION + +def recursion(A, H, c_g, ind_leaves, err0, err1, nb_leaves, S): + ind_cg = H.index(c_g) + 1 + v_cg = ind_leaves[ind_cg] + + step1(ind_cg, A, nb_leaves, ind_leaves, err1, S, H) + + step2(A, v_cg, err0, err1) + + + +#######TRACEBACK +def find_h_Z_min(A, ind_leaves): + p = 1 + while True: + v = 2*p + w = 2*p +1 + if A[v-1] == 0: + p = v + else: + p = w + + if is_leaf(ind_leaves,p): + return p + return None + +def search_key(dico, val): + l = [c for c,v in dico.items() if v==val] + if len(l) != 0: + return l[0] + else: + return -1 + + +def find_highest_point(X, hh): + #find the point with highest rows + for x in X: + if x[0][1] == hh: + return x + +def find_index_point(X, xy): + for x in X: + if xy in x: + return X.index(x) + return None + +def traceback(A, X, H, ind_leaves, S, up): + #By going up in the steps, we can determine the breaking point that construct the regression + b = search_key(ind_leaves,find_h_Z_min(A, ind_leaves)) #We look for leaf with minimal Z value as it is the first point + h = H[b-1] + breakpoint = list() + for i in range(len(X)-1, -1, -1): + x = X[i] #ie point by decreasing order of the sorted array + xy, w, lab = x + cg = xy[1] #column + if h == cg:#h is a breaking point + h = S[i] #we take the h from the previous step + breakpoint.append(xy) + + if X[0][2] == int(up) and X[0][0][1] == H[-2]: #To avoid forgetting the first point if it's the highest in column + breakpoint.append(X[0][0]) + + hx = find_highest_point(X, H[-2]) #To avoid forgetting the last point if the highest + id_hx = X.index(hx) + if len(breakpoint)!=0: + id_hbp = find_index_point(X, breakpoint[0]) + if hx[2] == int(up) and id_hx < id_hbp: + breakpoint.append(hx[0]) + + return breakpoint + + +def labels_point(X, bpr, rev, up): + r_p = list() + b_p = list() + + reg_err = 0 + for x in X: + lab = x[2] + x = x[0] + + if up and x in bpr: + r_p.append(x) + if lab == 0: + reg_err +=1 + + elif not up and x in bpr: + b_p.append(x) + if lab == 1: + reg_err +=1 + + else: + if not rev and up: #CASE 1 + flag = 0 + for br in bpr: + if x[0] >= br[0] and x[1] >= br[1]: + flag = 1 + if flag == 0: + b_p.append(x) + if lab == 1: + reg_err +=1 + else: + r_p.append(x) + if lab == 0: + reg_err +=1 + + if rev and up: #CASE 2 + flag = 0 #consider as blue by default + for br in bpr: + if x[0] <= br[0] and x[1] >= br[1]: + flag = 1 + if flag == 0: + b_p.append(x) + if lab == 1: + reg_err +=1 + else: + r_p.append(x) + if lab == 0: + reg_err +=1 + + if not rev and not up: #CASE 3 + flag = 1 #consider as red by default + for br in bpr: + if x[0] >= br[0] and x[1] >= br[1]: + flag = 0 + if flag == 0: + b_p.append(x) + if lab == 1: + reg_err +=1 + else: + r_p.append(x) + if lab == 0: + reg_err +=1 + + if rev and not up: #CASE 4 + flag = 1 #consider as red by default + for br in bpr: + if x[0] <= br[0] and x[1] >= br[1]: + flag =0 + if flag == 0: + b_p.append(x) + if lab == 
1: + reg_err +=1 + else: + r_p.append(x) + if lab == 0: + reg_err +=1 + + + return r_p, b_p, reg_err + + + + +#previous functions give us the data labelled as 0 and 1. But if we want +#to predict new points, we must know the official separation. As we prefer +#false positive rather than false negative, we draw the lines closest +#to "blue" points + +def breakpoint_b(X, b_p, rev, up): + bpb = list() + b_ps = sorted(b_p) + if not rev and up: #CASE 1 + while len(b_ps) != 0: + maxi = b_ps[-1] + x, y = maxi + bpb.append(maxi) + b_ps = [pt for pt in b_ps if pt[1] > y] + b_ps = sorted(b_ps) + + + elif rev and up: #CASE 2 + while len(b_ps) != 0: + maxi = b_ps[0] + x, y = maxi + bpb.append(maxi) + b_ps = [pt for pt in b_ps if pt[1] > y] + b_ps = sorted(b_ps) + + elif not rev and not up: #CASE 3 + while len(b_ps) != 0: + maxi = b_ps[0] + x, y = maxi + bpb.append(maxi) + b_ps = [pt for pt in b_ps if pt[0] > x] + b_ps = sorted(b_ps) + + elif rev and not up: #CASE 4 + while len(b_ps) != 0: + maxi = b_ps[-1] + x, y = maxi + bpb.append(maxi) + b_ps = [pt for pt in b_ps if pt[0] < x] + b_ps = sorted(b_ps) + return bpb + + +#Clean the list of border points + +def clean_blue(bpr, rev, up): + bpr = sorted(bpr) + nbpr = list() + + if rev and not up: + for bp in bpr: + itv = bpr[bpr.index(bp)+1:] + flag = True + for it in itv: + if it[1] <= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif rev and up: + for bp in bpr: + itv = bpr[:bpr.index(bp)] + flag = True + for it in itv: + if it[1] >= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif not rev and not up: + + for bp in bpr: + itv = bpr[:bpr.index(bp)] + flag = True + for it in itv: + if it[1] <= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif not rev and up: + + for bp in bpr: + itv = bpr[bpr.index(bp)+1:] + flag = True + for it in itv: + if it[1] >= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + assert len(nbpr) <= len(bpr) + return nbpr + + +def clean_red(bpr, rev, up): + bpr = sorted(bpr) + nbpr = list() + + if rev and up: + + for bp in bpr: + itv = bpr[bpr.index(bp)+1:] + flag = True + for it in itv: + if it[1] <= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif rev and not up: + + for bp in bpr: + itv = bpr[:bpr.index(bp)] + flag = True + for it in itv: + if it[1] >= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif not rev and up: + + for bp in bpr: + itv = bpr[:bpr.index(bp)] + flag = True + for it in itv: + if it[1] <= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + elif not rev and not up: + + for bp in bpr: + itv = bpr[bpr.index(bp)+1:] + flag = True + for it in itv: + if it[1] >= bp[1]: + flag=False + break + if flag: + nbpr.append(bp) + + assert len(nbpr) <= len(bpr) + return nbpr + + + +#####MAIN FUNCTION + +def compute_recursion(data, case = None): + #return X : sorted data + # labels : labelisation of each points + # bpb : points creating the delimitation of blue area + # bpr : points creating the delimitation of red area + + models = {} + if case is None: + + for rev in [True, False]: + for up in [True, False]: + + X, H, A, ind_leaves = initialization(data, rev) + + S = list() + labs = [x[2] for x in X] + nb_leaves = len(H) + + for i in range(len(X)): #((r,c), w, label) + x = X[i] + xy, w, lab = x + cg = xy[1] + err0, err1 = err(lab, w, abs(1-int(up))), err(lab, w, int(up)) + + recursion(A, H, cg, ind_leaves, err0, err1, nb_leaves, S) + + while not is_A_balanced(A): + for v in range(len(A), -1, -1): + rebalance(A,v, ind_leaves) + + 
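As a reading aid, the call pattern that the rest of the patch (for example single_error and prediction_pairs in cost_matrix_uncertainty.py) uses for compute_recursion and predict_uncertainty, on invented toy points; no outputs are asserted, the snippet only shows the data shapes involved.

from Module import monotonic_regression_uncertainty as mru
from Module import tools

# Toy points in the ((feature1, feature2), weight, label) format used throughout,
# with label 1 = severe (red) and 0 = non-severe (blue); values are invented.
data = [((0.2, 0.3), 1, 0), ((0.4, 0.1), 1, 0), ((0.6, 0.7), 1, 1), ((0.8, 0.9), 1, 1)]

key = 1                               # case 1: severity increases with both features
rev, up = tools.equiv_key_case(key)   # -> (False, True)

X, models = mru.compute_recursion(data, (rev, up, key))
reg_err, bpr, bpb, r_p, b_p = models[key]   # regression error, red/blue frontiers, red/blue labelled points

# A new point is classified against the two frontiers: 1 = red, 0 = blue, -1 = grey/uncertain.
print(mru.predict_uncertainty((0.7, 0.8), bpr, bpb, rev, up))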
if up: + bpr = traceback(A, X, H, ind_leaves, S, up) + r_p, b_p, reg_err = labels_point(X, bpr, rev, up) + + bpb = breakpoint_b(X, b_p, rev, up) + + else: + bpb = traceback(A, X, H, ind_leaves, S, up) + + r_p, b_p, reg_err = labels_point(X, bpb, rev, up) + bpr = breakpoint_b(X, r_p, rev, up) + + bpr = clean_red(bpr, rev, up) + bpb = clean_blue(bpb, rev, up) + + if rev and up: + models[2] = (reg_err, bpr, bpb, r_p, b_p) + elif not rev and up: + models[1] = (reg_err, bpr, bpb, r_p, b_p) + elif not rev and not up: + models[3] = (reg_err, bpr, bpb, r_p, b_p) + else: + models[4] = (reg_err, bpr, bpb, r_p, b_p) + + else: + #print('case {}'.format(case)) + rev, up = case[0], case[1] + X, H, A, ind_leaves = initialization(data, rev) + S = list() + labs = [x[2] for x in X] + nb_leaves = len(H) + #print(X) + for i in range(len(X)): #((r,c), w, label) + x = X[i] + xy, w, lab = x + cg = xy[1] + err0, err1 = err(lab, w, abs(1-int(up))), err(lab, w, int(up)) + + recursion(A, H, cg, ind_leaves, err0, err1, nb_leaves, S) + + while not is_A_balanced(A): + for v in range(len(A), -1, -1): + rebalance(A,v, ind_leaves) + + if up: + bpr = traceback(A, X, H, ind_leaves, S, up) + r_p, b_p, reg_err = labels_point(X, bpr, rev, up) + + bpb = breakpoint_b(X, b_p, rev, up) + + else: + bpb = traceback(A, X, H, ind_leaves, S, up) + r_p, b_p, reg_err = labels_point(X, bpb, rev, up) + bpr = breakpoint_b(X, r_p, rev, up) + + bpr = clean_red(bpr, rev, up) + bpb = clean_blue(bpb, rev, up) + + models[case[2]] = (reg_err, bpr, bpb, r_p, b_p) + + return X, models + + +#### PREDICTION For diff case + +def predict_uncertainty(p, bpr, bpb, rev, up): + flag = -1 + if not rev and up: #CASE 1 + for b in bpr: + if p[0] >= b[0] and p[1] >= b[1]: + flag = 1 + + for b in bpb: + if p[0] <= b[0] and p[1] <= b[1]: + flag = 0 + + + elif rev and up: #CASE 2 + for b in bpr: + if p[0] <= b[0] and p[1] >= b[1]: + flag = 1 + + for b in bpb: + if p[0] >= b[0] and p[1] <= b[1]: + flag = 0 + + elif not rev and not up: #CASE 3 + for b in bpr: + if p[0] <= b[0] and p[1] <= b[1]: + flag = 1 + + for b in bpb: + if p[0] >= b[0] and p[1] >= b[1]: + flag = 0 + + + elif rev and not up: #CASE 4 + for b in bpr: + if p[0] >= b[0] and p[1] <= b[1]: + flag = 1 + + for b in bpb: + if p[0] <= b[0] and p[1] >= b[1]: + flag = 0 + assert flag in [1, 0, -1], "Problem with prediction of the point, label is not 0, 1 or -1" + + return flag + + + +def predict_severe(p, bpr, bpb, rev, up): + # points in grey area are automatically labelled in red area + flag = predict_uncertainty(p, bpr, bpb, rev, up) + if flag == -1: + flag = 1 + + assert flag in [1, 0], "Problem with prediction of the point, label is not 0 or 1" + + + return flag + + +def predict_non_severe(p, bpr, bpb, rev, up): + # points in grey area are automatically labelled in blue area + flag = predict_uncertainty(p, bpr, bpb, rev, up) + if flag == -1: + flag = 0 + + assert flag in [1, 0], "Problem with prediction of the point, label is not 0 or 1" + + return flag + + + +#def predict_severe(p, bpr, bpb, rev, up): + # points in grey area are automatically labelled in red area +# flag = 1 +# if not rev and up: #CASE 1 +# for b in bpb: +# if p[0] <= b[0] and p[1] <= b[1]: +# flag = 0 + + +# elif rev and up: #CASE 2 +# for b in bpb: +# if p[0] >= b[0] and p[1] <= b[1]: +# flag = 0 + +# elif not rev and not up: #CASE 3 +# for b in bpb: +# if p[0] >= b[0] and p[1] >= b[1]: +# flag = 0 + + +# elif rev and not up: #CASE 4 +# for b in bpb: +# if p[0] <= b[0] and p[1] >= b[1]: +# flag = 0 +# return flag + + +#def 
predict_non_severe(p, bpr, bpb, rev, up): + # points in grey area are automatically labelled in blue area +# flag = 0 +# if not rev and up: #CASE 1 +# for b in bpr: +# if p[0] >= b[0] and p[1] >= b[1]: +# flag = 1 + + +# elif rev and up: #CASE 2 +# for b in bpr: +# if p[0] <= b[0] and p[1] >= b[1]: +# flag = 1 + +# elif not rev and not up: #CASE 3 +# for b in bpr: +# if p[0] <= b[0] and p[1] <= b[1]: +# flag = 1 + +# elif rev and not up: #CASE 4 +# for b in bpr: +# if p[0] >= b[0] and p[1] <= b[1]: +# flag = 1 + +# return flag diff --git a/Module/optimal_k_aggregations.py b/Module/optimal_k_aggregations.py new file mode 100755 index 0000000000000000000000000000000000000000..fe0c4504e7531b46aba97b2ec1d275cd48168112 --- /dev/null +++ b/Module/optimal_k_aggregations.py @@ -0,0 +1,151 @@ +from Module import cost_matrix_uncertainty as cmu +from Module import monotonic_regression_uncertainty as mru +from Module import selection_algorithm as sa +from Module import tools + + + +import time +import pandas as pd +import multiprocessing as mp +import copy +import numpy as np +import matplotlib.pyplot as plt + + + +def find_k_metamodel(df, ndf, cost, k, nbcpus, strat): + l = list() + mini = 1 + for s in strat: + p, mve = s(df, ndf, cost, k, nbcpus) + if mve<mini: + mini = mve + l.append((mve, p, s.__name__)) + potential = [L for L in l if L[0] == mini] + potential.sort(key=lambda x:x[2], reverse = True) + return potential[0] + + +#Functions that given some classifiers, construct the associated ensemble model and predict the output for a single patient + +def prediction_pairs(df, out, pair, funct): + p1, p2, key = pair.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + + tr1, tr2 = df[p1].values.tolist(), df[p2].values.tolist() + diag = df['target'].values.tolist() + + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + out_p = (out[p1], out[p2]) + + X, models = mru.compute_recursion(data, (rev, up, key)) + reg_err, bpr, bpb, r_p, b_p = models[key] + pred = funct(out_p, bpr, bpb, rev, up) # + return pred + + +def create_and_predict_metamodel(df_, out, pairs, nbcpus, funct): + #Return the majority vote prediction (pred_metamodel) and the probability of the being severe (nb of severe/total nb) + pool = mp.Pool(nbcpus) + df = copy.deepcopy(df_) + + vals = [(df, out, p, funct) for p in pairs] + + preds = pool.starmap(prediction_pairs, vals, max(1,len(vals)//nbcpus)) #Get all the predictions made by the different pairs for a single patient + + del df + return tools.pred_metamodel(preds), tools.proba_metamodel(preds) + + + +##Big loop to compute the average error for ensemble contaning k classifiers + + +def k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log): + print('k misclassification : {}\n'.format(funct)) + + k_mis = {k : list() for k in range(3, max_k)} #Store for each value of k, whereas patients were misclassified or not with an ensemble of k classifiers + + #pairs_err = {cl : list() for cl in cls} #For each classifiers we are going to store the average misclassification error (computed with LOOCV) made with each patients + + + for j in range(len(df)): + t1 = time.time() + f = open(log, 'a') + f.write('Patient {} \n'.format(j)) + f.close() + out = df.iloc[j, :] + df_2 = df.drop([j]) + df_2.reset_index(drop=True, inplace=True) + + ndf_err = cmu.error_matrix(df_2, cls, nbcpus,funct) + + + f = open(log, 'a') + f.write('cost matrix computed \n') + f.close() + + + cost = cmu.cost_classifiers(ndf_err) + + + if min_k == 1: + #k = 1 : As a first pair, we 
take one with the lowest error + temp = min(cost.values()) + res = [key for key in cost.keys() if cost[key] == temp] #Many classifiers can have the lowest error + pair = [res[0]] #We take one arbitrary + + + pred, proba = create_and_predict_metamodel(df_2, out, pair, nbcpus, funct) + if proba != -1: + k_mis[1].append(abs(out['target']-pred)) + else: #case of proba = -1 means that all the classifiers in the metamodel predicted the point as uncertain + print('case of unknown point in oka') + + min_k +=1 + + #k > 1: ensemble classifiers + for k in range(min_k, max_k): + t0 = time.time() + mve, pairs, algo = find_k_metamodel(df_2, ndf_err, cost, k, nbcpus, strat) + pred, proba = create_and_predict_metamodel(df_2, out, pairs, nbcpus, funct) + + if proba != -1: + k_mis[k].append(abs(out['target']-pred)) + else: #case of proba = -1 means that all the classifiers in the metamodel predicted the point as uncertain + print('case of unknown point in oka') + + f = open(log, 'a') + f.write('Creation metamodel for k = {} in {}s \n'.format(k, time.time()-t0)) + f.close() + + f = open(log, 'a') + f.write('End patient in {}s \n'.format(time.time() - t1)) + f.close() + + + k_error = {k : np.mean(k_mis[k]) for k in range(min_k, max_k)} + + return k_error + + + +def optimal_k(k_error): + mini = min(k_error.values()) + keys = [k for k in k_error.keys() if k_error[k] == mini] + return min(keys), mini + + +def print_k_error(k_error, file): + x, y = k_error.keys(), k_error.values() + plt.figure() + plt.plot(x,y) + plt.title('Average error for ensembles of classifiers') + plt.xlabel('Nb of classifiers') + plt.ylabel('Average error') + if file: + plt.savefig(file) + else: + plt.show() diff --git a/Module/preselection.py b/Module/preselection.py new file mode 100644 index 0000000000000000000000000000000000000000..caffa4d1d8179d42d7cbc36e1f88ad84231aa130 --- /dev/null +++ b/Module/preselection.py @@ -0,0 +1,59 @@ +from Module import cost_matrix_uncertainty as cmu +from Module import optimal_k_aggregations as oka +from Module import monotonic_regression_uncertainty as mru +from Module import tools +from Module import selection_algorithm as sa + +import multiprocessing as mp + +import pandas as pd +import matplotlib.pyplot as plt +import os + + +def all_configurations(df): + transcripts = list(df.columns) + transcripts.remove('target') + + configurations = list() + for i in range(len(transcripts)): + for j in range(i+1, len(transcripts)): + for key in range(1,5): + configurations.append('/'.join([transcripts[i], transcripts[j], str(key)])) + return configurations + + +def single_score(cl, df): + p1, p2, key = cl.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + + diag = df['target'].to_list() + tr1, tr2 = df[p1].to_list(), df[p2].to_list() + data = [((tr1[i], tr2[i]), 1, diag[i]) for i in range(len(diag))] + + + X, m = mru.compute_recursion(data, (rev, up, key)) + reg_err, bpr, bpb, r_p, b_p = m[key] + + return (cl, reg_err) + +def regression_error_score(df, cls, nbcpus): + pool = mp.Pool(nbcpus) + + vals = [(cl, df) for cl in cls] + res = pool.starmap(single_score, vals, max(1, len(vals)//nbcpus)) + + dico = {r[0] : r[1] for r in res} + s = pd.Series(dico) + s.name = 'regression error' + return s + +def regression_error_matrix(df, nbcpus): + config = all_configurations(df) + reg = regression_error_score(df, config, nbcpus) + return reg + + +def preselection_reg_err(reg_err_mat, threshold): + return reg_err_mat[reg_err_mat<threshold] diff --git a/Module/selection_algorithm.py 
b/Module/selection_algorithm.py new file mode 100755 index 0000000000000000000000000000000000000000..84716056a479f22549770fc154c8df323393bc68 --- /dev/null +++ b/Module/selection_algorithm.py @@ -0,0 +1,161 @@ +from Module import cost_matrix_uncertainty as cmu +from Module import measures as ms + +import copy +import multiprocessing as mp +import time + + +#Filter the matrix: eliminate redundant transcripts + +def filter_pairs_greedy(pairs): + used_trans = list() + delete_col = list() + for i in range(len(pairs)): + p1, p2, key = pairs[i].split("/") + if p1 in used_trans or p2 in used_trans: + delete_col.append(pairs[i]) + else: + used_trans.append(p1) + used_trans.append(p2) + for x in delete_col: + pairs.remove(x) + return pairs + + + +def filter_pairs_adapt(pairs, cl): + delete_pairs = list() + idx = pairs.index(cl) + for i in range(idx, len(pairs)): + p1, p2, key = pairs[i].split("/") + if p1 in cl or p2 in cl: + delete_pairs.append(pairs[i]) + for x in delete_pairs: + pairs.remove(x) + return pairs + + +#########N-best + + + +def NB(df, ndf_, cost, k, nbcpus, mes = ms.MVE): + + ndf = copy.deepcopy(ndf_) + ndf.drop(['uncertain', 'error'], inplace=True) + + pairs = sorted(cost.items(), key=lambda t: t[1]) + pairs = [pairs[i][0] for i in range(len(pairs))] + pairs = filter_pairs_greedy(pairs) + pairs = pairs[0:k] + + set = [ndf[p].values.tolist() for p in pairs] + + return pairs, ms.MVE(set) + + +##### Forward search + + +def test_candidate_FS(cand_pairs, cls, ndf, mes, i): + cp = cand_pairs[i] + + candidate_set_pairs = [ndf[cl].values.tolist() for cl in cls] + candidate_set_pairs.append(ndf[cp].values.tolist()) + cp_ms = mes(candidate_set_pairs) + + return (i, cp, cp_ms) + + + +def FS(df, ndf_, cost, k, nbcpus, mes = ms.MVE, jump = 30): + pool = mp.Pool(nbcpus) + + ndf = copy.deepcopy(ndf_) + ndf.drop(['uncertain', 'error'], inplace=True) + + temp = min(cost.values()) + res = [key for key in cost.keys() if cost[key] == temp] #Many classifiers can have the lowest error + cl = res[0] #We take one arbitrary + cls = [cl] #We start with an ensemble of k=1 classifiers + + pairs = sorted(cost.items(), key=lambda t: t[1]) + pairs = [pairs[i][0] for i in range(len(pairs))] + + ind = 1 + tot_ind = ind + + while len(cls) < k: #We add classifiers until we reach k + #Condition if we reach the end of the list of classifiers, we start again from the begining by eliminating the already used cls + if tot_ind + jump > len(pairs): + pairs = [pairs[p] for p in range(len(pairs)) if pairs[p] not in cls] + ind = 1 + tot_ind = ind + + cand_pairs = pairs[ind:ind+jump] + + vals = [(cand_pairs, cls, ndf, mes, i) for i in range(len(cand_pairs))] + + res = pool.starmap(test_candidate_FS, vals, max(1,len(vals)//nbcpus)) + res.sort(key=lambda x:x[2]) + + i, cp, cp_ms = res[0] + best_cp_ms = cp_ms + best_cand = cp + ind = tot_ind + i +1 + tot_ind = ind + + + cls.append(best_cand) + + pairs = filter_pairs_adapt(pairs, best_cand) + + + set = [ndf[p].values.tolist() for p in cls] + + return cls, ms.MVE(set) + + +#### Backward Search + + +def test_candidate_BS(cand_pairs, cls, ndf, mes, i): + cp = cand_pairs[i] + + candidate_set_pairs = [ndf[cl].values.tolist() for cl in cls if cl != cp] + + cp_ms = mes(candidate_set_pairs) + + return (i, cp, cp_ms) + + +def BS(df, ndf_, cost, k, nbcpus, mes = ms.F2, end = 30): + + pool = mp.Pool(nbcpus) + ndf = copy.deepcopy(ndf_) + ndf.drop(['uncertain', 'error'], inplace=True) + + + pairs = sorted(cost.items(), key=lambda t: t[1]) + pairs = [pairs[i][0] for i in range(len(pairs))] + 
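For context on how these selection strategies are consumed elsewhere in the patch: NB, FS and BS share the signature strat(df, ndf, cost, k, nbcpus) and return (selected_pairs, MVE), so they can be passed as a list to optimal_k_aggregations.find_k_metamodel. The sketch below feeds NB a hand-built stand-in for the matrix produced by cost_matrix_uncertainty.error_matrix; the column names and values are invented.

import pandas as pd
from Module import selection_algorithm as sa

# Stand-in for cmu.error_matrix output: one row per patient (0 = correct, 1 = error,
# -1 = uncertain), plus the 'uncertain' and 'error' summary rows; values are invented.
cols = ['g1/g2/1', 'g3/g4/2', 'g5/g6/1']
ndf = pd.DataFrame([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 0]],
                   columns=cols, index=['P1', 'P2', 'P3', 'P4'])
ndf['target'] = [1, 0, 1, 0]
ndf.loc['uncertain'] = [0, 0, 0, None]
ndf.loc['error'] = [0.25, 0.25, 0.25, None]

cost = {c: ndf.at['error', c] for c in cols}   # same mapping as cmu.cost_classifiers

pairs, mve = sa.NB(None, ndf, cost, k=2, nbcpus=2)   # NB ignores df and nbcpus
print(pairs, mve)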
pairs = filter_pairs_greedy(pairs) + cls = pairs[:end] + + while len(cls) > k: + + cand_pairs = cls.copy() + + vals = [(cand_pairs, cls, ndf, mes, i) for i in range(len(cand_pairs))] + + res = pool.starmap(test_candidate_BS, vals, max(1,len(vals)//nbcpus)) + + res.sort(key=lambda x:x[2]) + + i, cp, cp_ms = res[0] + cls.remove(cp) + + + set = [ndf[p].values.tolist() for p in cls] + + return cls, ms.MVE(set) diff --git a/Module/show_results_4cases.py b/Module/show_results_4cases.py new file mode 100755 index 0000000000000000000000000000000000000000..f83fc86d1914c8f9311a16d0cd660cfb8b62c5af --- /dev/null +++ b/Module/show_results_4cases.py @@ -0,0 +1,498 @@ +from Module import monotonic_regression_uncertainty as mru +from Module import tools + +import multiprocessing as mp + +import matplotlib.pyplot as plt +from matplotlib import patches +import colorsys +import random + +from math import log + + + + +def cr_models(p, df): + p1, p2, key = p.split('/') + key = int(key) + rev, up = tools.equiv_key_case(key) + tr1 = df[p1].values.tolist() + tr2 = df[p2].values.tolist() + diag = df['target'].values.tolist() + data = [((tr1[n], tr2[n] ), 1, diag[n]) for n in range(len(diag))] + + X, models = mru.compute_recursion(data, (rev, up, key)) + + return p1, p2, models, data + + +def print_model(data, models, p1, p2, df1, pathname = None, cm=None): + #print the monotonic space with the 3 areas + + if '.1.1' in p1: + p1 = p1[:-2] + if '.1.1' in p2: + p2 = p2[:-2] + + try: + g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] + except: + g1 = p1 + try: + g2 = df1[df1['Probeset_ID'] == p2]['Gene.Symbol'].values.tolist()[0] + except: + g2 = p2 + + for key in models.keys(): + key = int(key) + #print('Key', key) + plt.figure(figsize=(3,3)) + ax = plt.axes() + ax.set_facecolor("lightgray") + + x_r = list() + y_r = list() + x_b = list() + y_b = list() + for i in range(len(data)): + xy, w, lab = data[i] + x, y = xy + if lab == 0: #blue + x_r.append(x) + y_r.append(y) + else: #red + x_b.append(x) + y_b.append(y) + + reg_err, bpr, bpb, r_p, b_p = models[key] + + + min_x = min(min(x_r), min(x_b))-0.05 + min_y = min(min(y_r), min(y_b))-0.05 + max_x = max(max(x_r), max(x_b))+0.05 + max_y = max(max(y_r), max(y_b))+0.05 + + for bp in bpb: + x, y = bp + if key == 1: + ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightskyblue', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightskyblue', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor = 'lightskyblue', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightskyblue', zorder = 1)) + + + for bp in bpr: + x, y = bp + + + if key == 1: + ax.add_artist(patches.Rectangle((x, y), max_x+abs(x), max_y+abs(y), facecolor ='lightcoral', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((min_x, y ), abs(x-min_x), max_y+abs(y), facecolor = 'lightcoral', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((min_x, min_y), abs(x-min_x), abs(y-min_y), facecolor = 'lightcoral', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((x, min_y), max_x+abs(x), abs(y-min_y), facecolor = 'lightcoral', zorder = 1)) + + + #plt.scatter(x_r, y_r, c = 'blue', marker='.', zorder = 2) + #plt.scatter(x_b, y_b, c = 'red', marker='.', zorder = 2) + + random.shuffle(data) + + for d in data: + if d[2] == 0: + plt.scatter(d[0][0], 
d[0][1], c = 'blue', marker='.', zorder = 2) + elif d[2] == 1: + plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.xlabel(g1) + plt.ylabel(g2) + + if cm is not None: + pair = '/'.join([p1, p2, str(key)]) + error = cm.at['error', pair] + plt.title('LOOCV Error {}'.format(error)) + + if pathname is not None: + plt.savefig(pathname + g1 + '_' + g2 + '.png') + + f = open(pathname + 'gene.txt', 'a') + f.write('{} : {}\n'.format(g1, p1)) + f.write('{} : {}\n'.format(g2, p2)) + f.close() + else: + plt.show() + + + +def print_model_log(data, models, p1, p2, df1, pathname = None): + #print the monotonic space with the 3 areas + + data = [((log(d[0][0]), log(d[0][1])), d[1], d[2]) for d in data] + + if '.1.1' in p1: + p1 = p1[:-2] + if '.1.1' in p2: + p2 = p2[:-2] + + try: + g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] + except: + g1 = p1 + try: + g2 = df1[df1['Probeset_ID'] == p2]['Gene.Symbol'].values.tolist()[0] + except: + g2 = p2 + + for key in models.keys(): + key = int(key) + #print('Key', key) + plt.figure(figsize=(3,3)) + ax = plt.axes() + ax.set_facecolor("lightgray") + + x_r = list() + y_r = list() + x_b = list() + y_b = list() + for i in range(len(data)): + xy, w, lab = data[i] + x, y = xy + if lab == 0: #blue + x_r.append(x) + y_r.append(y) + else: #red + x_b.append(x) + y_b.append(y) + + reg_err, bpr, bpb, r_p, b_p = models[key] + + for bp in bpb: + x, y = bp + if key == 1: + ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightskyblue', zorder = 1)) + + + for bp in bpr: + x, y = bp + + + if key == 1: + ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor ='lightcoral', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightcoral', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightcoral', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) + + + #plt.scatter(x_r, y_r, c = 'blue', marker='.', zorder = 2) + #plt.scatter(x_b, y_b, c = 'red', marker='.', zorder = 2) + + random.shuffle(data) + + for d in data: + if d[2] == 0: + plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + elif d[2] == 1: + plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.xlabel(g1) + plt.ylabel(g2) + + if pathname is not None: + plt.savefig(pathname + g1 + '_' + g2 + '.png') + + f = open(pathname + 'gene.txt', 'a') + f.write('{} : {}\n'.format(g1, p1)) + f.write('{} : {}\n'.format(g2, p2)) + f.close() + else: + plt.show() + + + +def print_model_out(data, out, models, p1, p2, df1, pathname = None): + #print the monotonic space with the 3 areas + + if '.1.1' in p1: + p1 = p1[:-2] + if '.1.1' in p2: + p2 = p2[:-2] + + try: + g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] + except: + g1 = p1 + try: + g2 = df1[df1['Probeset_ID'] == p2]['Gene.Symbol'].values.tolist()[0] + except: + g2 = p2 + + for key in models.keys(): + key = int(key) + print('Key', key) + plt.figure(figsize=(5,5)) + ax = plt.axes() + ax.set_facecolor("lightgray") + + x_r = list() + y_r = list() + x_b = list() + y_b = list() + for i in range(len(data)): + xy, w, 
lab = data[i] + x, y = xy + if lab == 0: #blue + x_r.append(x) + y_r.append(y) + else: #red + x_b.append(x) + y_b.append(y) + + reg_err, bpr, bpb, r_p, b_p = models[key] + + for bp in bpb: + x, y = bp + if key == 1: + ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightskyblue', zorder = 1)) + + + for bp in bpr: + x, y = bp + + + if key == 1: + ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor ='lightcoral', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightcoral', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightcoral', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) + + + #plt.scatter(x_r, y_r, c = 'blue', zorder = 2) + #plt.scatter(x_b, y_b, c = 'red', zorder = 2) + + random.shuffle(data) + + for d in data: + if d[2] == 0: + plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + elif d[2] == 1: + plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.scatter(out[0], out[1], c = 'g', zorder = 2) + plt.xlabel(g1) + plt.ylabel(g2) + + if pathname is not None: + plt.savefig(pathname + g1 + '_' + g2 + '.png') + + f = open(pathname + 'gene.txt', 'a') + f.write('{} : {}\n'.format(g1, p1)) + f.write('{} : {}\n'.format(g2, p2)) + f.close() + else: + plt.show() + + + +def rank_h(x, D, rev): + h = [d[0] for d in D] + h = sorted(h, reverse=rev) + r = h.index(x) + + if rev: + return len(D) - r + 1 + else: + return r+1 + +def rank_v(x, D, up): + h = [d[1] for d in D] + h.sort() + r = h.index(x) + #if up: + return r+1 + #else: + # return len(D) - r + 1 + + + + + + + +def print_model_RS(data, models, p1, p2, df1, pathname = None, cm=None): + D_hat = [x[0] for x in data] + + + plt.figure(figsize=(3,3)) + ax = plt.axes() + ax.set_facecolor("lightgray") + plt.title('Ranking Space') + plt.xlabel(p1) + plt.ylabel(p2) + + for key in models.keys(): + key = int(key) + + rev, up = tools.equiv_key_case(key) + + reg, bpr, bpb, rps, bps = models[key] + + + + for bp in bpb: + x, y = bp + rxd = rank_h(x, D_hat, rev) + ryd = rank_v(y, D_hat, up) + + if key == 1: + ax.add_artist(patches.Rectangle((0.0, 0.0), rxd, ryd, facecolor = 'lightskyblue', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((rxd, 0), 1000, ryd, facecolor = 'lightskyblue', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((rxd, ryd), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((0, ryd ), rxd, 1000, facecolor = 'lightskyblue', zorder = 1)) + + + for bp in bpr: + x, y = bp + rxd = rank_h(x, D_hat, rev) + ryd = rank_v(y, D_hat, up) + + if key == 1: + ax.add_artist(patches.Rectangle((rxd, ryd), 1000, 1000, facecolor ='lightcoral', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((0, ryd ), rxd, 1000, facecolor = 'lightcoral', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((0.0, 0.0), rxd, ryd, facecolor = 'lightcoral', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((rxd, 0), 1000, ryd, facecolor = 'lightcoral', zorder = 1)) + + random.shuffle(data) + for d in data: + rxd = rank_h(d[0][0], D_hat, rev) + 
ryd = rank_v(d[0][1], D_hat, up) + if d[2] == 1: + plt.scatter(rxd, ryd, c='red', marker='.', zorder=2) + elif d[2] == 0: + plt.scatter(rxd, ryd, c='blue', marker='.', zorder=2) + + plt.show() + +def print_severe(data, models, p1, p2, df1, pathname = None): + #print the monotonic space with the 3 areas + + if '.1.1' in p1: + p1 = p1[:-2] + if '.1.1' in p2: + p2 = p2[:-2] + + try: + g1 = df1[df1['Probeset_ID'] == p1]['Gene.Symbol'].values.tolist()[0] + except: + g1 = p1 + try: + g2 = df1[df1['Probeset_ID'] == p2]['Gene.Symbol'].values.tolist()[0] + except: + g2 = p2 + + for key in models.keys(): + key = int(key) + print('Key', key) + plt.figure(figsize=(5,5)) + ax = plt.axes() + ax.set_facecolor("lightcoral") + + x_r = list() + y_r = list() + x_b = list() + y_b = list() + for i in range(len(data)): + xy, w, lab = data[i] + x, y = xy + if lab == 0: #blue + x_r.append(x) + y_r.append(y) + else: #red + x_b.append(x) + y_b.append(y) + + reg_err, bpr, bpb, r_p, b_p = models[key] + + for bp in bpb: + x, y = bp + if key == 1: + ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 2: + ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightskyblue', zorder = 1)) + elif key == 3: + ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor = 'lightskyblue', zorder = 1)) + else: + ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightskyblue', zorder = 1)) + + + #for bp in bpr: + # x, y = bp + + + # if key == 1: + # ax.add_artist(patches.Rectangle((x, y), 1000, 1000, facecolor ='lightcoral', zorder = 1)) + # elif key == 2: + # ax.add_artist(patches.Rectangle((0, y ), x, 1000, facecolor = 'lightcoral', zorder = 1)) + # elif key == 3: + # ax.add_artist(patches.Rectangle((0.0, 0.0), x, y, facecolor = 'lightcoral', zorder = 1)) + # else: + # ax.add_artist(patches.Rectangle((x, 0), 1000, y, facecolor = 'lightcoral', zorder = 1)) + + + random.shuffle(data) + + for d in data: + if d[2] == 0: + plt.scatter(d[0][0], d[0][1], c = 'blue', marker='.', zorder = 2) + elif d[2] == 1: + plt.scatter(d[0][0], d[0][1], c = 'red', marker='.', zorder = 2) + plt.xlabel(g1) + plt.ylabel(g2) + + if pathname is not None: + plt.savefig(pathname + g1 + '_' + g2 + '.png') + + f = open(pathname + 'gene.txt', 'a') + f.write('{} : {}\n'.format(g1, p1)) + f.write('{} : {}\n'.format(g2, p2)) + f.close() + else: + plt.show() + + + +def show_results(df, probs_df, pairs, nbcpus, pathname, cm): + pool = mp.Pool(nbcpus) + vals = [(p, df) for p in pairs] + + res = pool.starmap(cr_models, vals, max(1,len(vals)//nbcpus) ) + + for r in res: + p1, p2, models, data = r + print_model(data, models, p1, p2, probs_df, pathname, cm) diff --git a/Module/stages.py b/Module/stages.py new file mode 100755 index 0000000000000000000000000000000000000000..24ab16b211d792e7beffe4156d7682c48f5d2e37 --- /dev/null +++ b/Module/stages.py @@ -0,0 +1,82 @@ +from Module import cost_matrix_uncertainty as cmu +from Module import optimal_k_aggregations as oka +from Module import monotonic_regression_uncertainty as mru +from Module import tools +from Module import selection_algorithm as sa +from Module import preselection as psl + +import pandas as pd +import matplotlib.pyplot as plt +import os + + +from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef + + +def stage0(df, nbcpus, threshold): + reg = psl.regression_error_matrix(df, nbcpus) + if threshold is not None: + reg = psl.preselection_reg_err(reg, threshold) + return reg + + + + +def stage_1(df, cls, min_k, 
diff --git a/Module/stages.py b/Module/stages.py
new file mode 100755
index 0000000000000000000000000000000000000000..24ab16b211d792e7beffe4156d7682c48f5d2e37
--- /dev/null
+++ b/Module/stages.py
@@ -0,0 +1,82 @@
+from Module import cost_matrix_uncertainty as cmu
+from Module import optimal_k_aggregations as oka
+from Module import monotonic_regression_uncertainty as mru
+from Module import tools
+from Module import selection_algorithm as sa
+from Module import preselection as psl
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+from sklearn.metrics import roc_auc_score, confusion_matrix, matthews_corrcoef
+
+
+def stage0(df, nbcpus, threshold):
+    # Preselection: compute the regression-error matrix and, if a threshold
+    # is given, keep only the features below it.
+    reg = psl.regression_error_matrix(df, nbcpus)
+    if threshold is not None:
+        reg = psl.preselection_reg_err(reg, threshold)
+    return reg
+
+
+def stage_1(df, cls, min_k, max_k, nbcpus, strat, funct, log):
+    # Find the optimal number k of pairs to aggregate in the metamodel.
+    k_error = oka.k_missclassification(df, cls, nbcpus, funct, strat, min_k, max_k, log)
+    k_opt, err_k = oka.optimal_k(k_error)
+    return k_opt, k_error
+
+
+def stage_2(df, cls, k_opt, auc_file, conf_mat_file, nbcpus, funct, strat, logs):
+    # Leave-one-out evaluation of the k_opt-pair metamodel.
+    errors, y_true, y_pred, y_proba = list(), list(), list(), list()
+
+    for i in range(len(df)):
+        out = df.iloc[i, :]
+        df_2 = df.drop([i])
+        df_2.reset_index(drop=True, inplace=True)
+        f = open(logs, 'a')
+        f.write('Patient {} \n'.format(i))
+        f.close()
+
+        ndf_err = cmu.error_matrix(df_2, cls, nbcpus, funct)
+        cost = cmu.cost_classifiers(ndf_err)
+
+        f = open(logs, 'a')
+        f.write('Matrix computed \n')
+        f.close()
+
+        mve, pairs, algo = oka.find_k_metamodel(df_2, ndf_err, cost, k_opt, nbcpus, strat)
+        pred, proba = oka.create_and_predict_metamodel(df_2, out, pairs, nbcpus, funct)
+
+        f = open(logs, 'a')
+        f.write('Pred={}, Proba={} \n'.format(pred, proba))
+        f.close()
+
+        errors.append(abs(out['target'] - pred))
+        y_proba.append(proba)
+        y_true.append(out['target'])
+        y_pred.append(pred)
+
+    acc = errors.count(0) / len(errors)
+    auc = tools.auc_score(y_proba, y_true, auc_file)
+    auc2 = roc_auc_score(y_true, y_proba)
+    CI = tools.confidence_interval(auc, y_true)
+    conf_mat = confusion_matrix(y_true, y_pred)
+    mcc = matthews_corrcoef(y_true, y_pred)
+
+    f = open(logs, 'a')
+    f.write('acc={}, auc={}, CI = {}, aucsk = {}, confusion_matrix = {}, mcc = {} \n'.format(acc, auc, CI, auc2, conf_mat, mcc))
+    f.close()
+
+    labels, probas, uncertain_pts = tools.unclassified_points(y_true, y_proba)
+    return acc, auc, CI, conf_mat
+
+
+def stage_3(df, cls, k_opt, nbcpus, funct, strat):
+    # Fit the final metamodel on the full dataset and return the selected pairs.
+    ndf_err = cmu.error_matrix(df, cls, nbcpus, funct)
+    ndf_err.to_csv('cm_st3.csv')
+    cost = cmu.cost_classifiers(ndf_err)
+    mve, pairs, algo = oka.find_k_metamodel(df, ndf_err, cost, k_opt, nbcpus, strat)
+    return pairs, mve
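Taken together, stage0 preselects features, stage_1 picks the number k of pairs, stage_2 evaluates the k-pair metamodel in a leave-one-out loop, and stage_3 refits it on the full cohort. The sketch below only illustrates that call sequence; the file name, the data layout, and the cls/funct/strat values are placeholders, since their real definitions live in Module files not shown in this diff.

# Hypothetical end-to-end run; every value below is an assumption, not a default of this repo.
import pandas as pd
from Module import stages

df = pd.read_csv('cohort.csv')   # assumed: one patient per row, binary 'target' column
cls = None                       # placeholder: the classifier/case specification expected by cost_matrix_uncertainty
funct = None                     # placeholder: the per-pair fitting/prediction function used by the package
strat = None                     # placeholder: the pair-selection strategy passed to selection_algorithm

reg = stages.stage0(df, nbcpus=4, threshold=None)
k_opt, k_error = stages.stage_1(df, cls, min_k=1, max_k=10, nbcpus=4, strat=strat, funct=funct, log='stage1.log')
acc, auc, CI, conf_mat = stages.stage_2(df, cls, k_opt, 'roc.png', 'cm.png', 4, funct, strat, 'stage2.log')
pairs, mve = stages.stage_3(df, cls, k_opt, 4, funct, strat)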
diff --git a/Module/tools.py b/Module/tools.py
new file mode 100755
index 0000000000000000000000000000000000000000..273b80dc2abe117989f6c1f5a1732c84feb50ea8
--- /dev/null
+++ b/Module/tools.py
@@ -0,0 +1,186 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from copy import deepcopy
+import seaborn as sn
+
+
+def equiv_key_case(i):
+    # Map a case index (1-4) to its pair of orientation flags.
+    equiv = {1: (False, True), 2: (True, True), 3: (False, False), 4: (True, False)}
+    return equiv[i]
+
+
+def equiv_case_key(rev):
+    # Inverse mapping: orientation flags back to the case index.
+    equiv = {(False, True): 1, (True, True): 2, (False, False): 3, (True, False): 4}
+    return equiv[rev]
+
+
+# PREDICTION FROM A METAMODEL
+def pred_metamodel(preds):
+    # preds is an array with all the predicted labels from the classifiers belonging to the metamodel
+    a = preds.count(1)
+    b = preds.count(0)
+    c = preds.count(-1)
+    assert a + b + c == len(preds), "Problem in the error predictions for a patient"
+    if c == len(preds):  # If all predictions are -1 (unclassified), predict the patient as 1
+        return 1
+    if a < b:  # Fewer 1s than 0s: predict 0
+        return 0
+    else:  # At least as many 1s as 0s: predict 1
+        return 1
+
+
+# Predicted probability of being severe
+def proba_metamodel(preds):
+    # preds is an array with all the predicted labels from the classifiers belonging to the metamodel
+    a = preds.count(1)
+    b = preds.count(0)
+    c = preds.count(-1)
+    assert a + b + c == len(preds), "Problem in the probabilities for a patient"
+    if a + b != 0:
+        return a / (a + b)
+    else:
+        return -1
+
+
+def get_index_positions(list_of_elems, element):
+    # Return the positions of every occurrence of element in list_of_elems.
+    index_pos_list = []
+    index_pos = 0
+    while True:
+        try:
+            index_pos = list_of_elems.index(element, index_pos)
+            index_pos_list.append(index_pos)
+            index_pos += 1
+        except ValueError:
+            break
+    return index_pos_list
+
+
+def unclassified_points(labels, probas):
+    # Drop the points whose probability is -1 (unclassified) and return the
+    # remaining labels/probas together with the indices of the unclassified points.
+    un_pts = get_index_positions(probas, -1)
+    new_labels = list()
+    new_probas = list()
+    for i in range(len(probas)):
+        if i not in un_pts:
+            new_labels.append(labels[i])
+            new_probas.append(probas[i])
+    return new_labels, new_probas, un_pts
+
+
+# AUC score
+def perf_metrics(y_actual, y_hat, threshold):
+    tp = 0
+    fp = 0
+    tn = 0
+    fn = 0
+
+    for i in range(len(y_hat)):
+        if y_hat[i] >= threshold:
+            if y_actual[i] == 1:
+                tp += 1
+            else:
+                fp += 1
+        else:
+            if y_actual[i] == 0:
+                tn += 1
+            else:
+                fn += 1
+    tpr = tp / (tp + fn)
+    fpr = fp / (tn + fp)
+
+    return [fpr, tpr]
+
+
+def auc_score(probas, labels, auc_file=None):
+    thresholds = np.arange(0.0, 1.01, 0.001)
+
+    # Each ROC point is [fpr, tpr] at a given threshold.
+    roc_points = []
+    for threshold in thresholds:
+        rates = perf_metrics(labels, probas, threshold)
+        roc_points.append(rates)
+
+    # Segment endpoints of the ROC curve between consecutive thresholds.
+    fpr_segments = []
+    tpr_segments = []
+    for i in range(len(roc_points) - 1):
+        point1 = roc_points[i]
+        point2 = roc_points[i + 1]
+        fpr_segments.append([point1[0], point2[0]])
+        tpr_segments.append([point1[1], point2[1]])
+
+    # The segments run from high to low TPR, so the trapezoidal integral of
+    # FPR d(TPR) equals AUC - 1; adding 1 recovers the AUC.
+    auc3 = sum(np.trapz(fpr_segments, tpr_segments)) + 1
+    assert 0 <= auc3 <= 1, "AUC score is not in [0,1]"
+    print('Area under curve={}'.format(auc3))
+
+    CI = confidence_interval(auc3, labels)
+
+    if auc_file is not None:
+        plt.plot(fpr_segments, tpr_segments, linestyle='--', color='darkorange', lw=2, label='ROC curve', clip_on=False)
+        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
+        plt.xlim([0.0, 1.0])
+        plt.ylim([0.0, 1.0])
+        plt.xlabel('False Positive Rate')
+        plt.ylabel('True Positive Rate')
+        plt.title('AUC={}, CI=[{};{}]'.format(round(auc3, 3), CI[0], CI[1]))
+        plt.legend(loc="lower right")
+        plt.savefig(auc_file)
+    return auc3
+
+
+def confidence_interval(auc, labels):
+    # 95% confidence interval around the AUC, clipped to [0, 1].
+    labels = list(labels)
+    N1 = labels.count(1)
+    N2 = labels.count(0)
+    Q1 = auc / (2 - auc)
+    Q2 = (2 * (auc ** 2)) / (1 + auc)
+    SE = np.sqrt((auc * (1 - auc) + (N1 - 1) * (Q1 - auc ** 2) + (N2 - 1) * (Q2 - auc ** 2)) / (N1 * N2))
+    low_b = auc - 1.96 * SE
+    high_b = auc + 1.96 * SE
+    if low_b < 0:
+        low_b = 0
+    if high_b > 1:
+        high_b = 1
+    return [round(low_b, 3), round(high_b, 3)]
+
+
+def confusion_matrix(y_real, y_pred, conf_mat_file):
+    tp = 0
+    fp = 0
+    tn = 0
+    fn = 0
+
+    for i in range(len(y_real)):
+        if y_real[i] == 1:
+            if y_pred[i] == 1:
+                tp += 1
+            else:
+                fn += 1
+        elif y_real[i] == 0:
+            if y_pred[i] == 0:
+                tn += 1
+            else:
+                fp += 1
+
+    # Rows are the output (predicted) labels, columns the true labels.
+    conf_mat = [[tn, fn], [fp, tp]]
+
+    if conf_mat_file is not None:
+        plt.figure(figsize=(5, 5))
+        sn.set(font_scale=1.4)
+        sn.heatmap(np.array(conf_mat), annot=True, annot_kws={"size": 16})
+        plt.xlabel('True labels')
+        plt.ylabel('Output labels')
+        plt.title('Confusion matrix')
+        plt.savefig(conf_mat_file)
+
+    return conf_mat
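As a quick sanity check of the helpers above, the following self-contained snippet (illustrative values only) exercises the metamodel vote, the abstention-aware probability, and the confidence interval, whose standard error follows the Hanley-McNeil formulation; the toy vote and label lists are assumptions for demonstration.

# Illustrative values only; assumes Module/tools.py from this diff is importable.
from Module import tools

votes = [1, 1, 0, -1, 1]              # three classifiers vote 1, one votes 0, one abstains (-1)
print(tools.pred_metamodel(votes))    # 1  (at least as many 1s as 0s)
print(tools.proba_metamodel(votes))   # 0.75  (abstentions are excluded from the denominator)

labels = [1, 1, 0, 0, 1, 0, 1, 0]     # toy ground-truth labels
print(tools.confidence_interval(0.8, labels))  # 95% CI around an AUC of 0.8, clipped to [0, 1]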