diff --git a/jass/models/worktable.py b/jass/models/worktable.py
index ebda182f5d4ec489ad291347f0ec2ba5ea13210a..4840f8fe19fd90109e2029d24677842a1479e25b 100644
--- a/jass/models/worktable.py
+++ b/jass/models/worktable.py
@@ -35,23 +35,23 @@ def signif(x, digit):
     """
     signif
     Round a number x to represent it with <digit> digits
-    
+
     :param x: the number to round
     :type x: float
-    
+
     :param digit: the number of digits
     :type digit: int
-    
+
     :return: the rounded number
     :rtype: float
-    
+
     example:
     >>> signif(1.2345678, 1)
     1.0
-    
+
     >>> signif(1.2345678, 3)
     1.23
-    
+
     >>> signif(1.2345678, 5)
     1.2346
     """
@@ -95,7 +95,7 @@ def choose_stat_function(smart_na_computation, optim_na, big, function_name, sta

 def add_signif_status_column(region_sub_tab, significance_treshold=5*10**-8):
     region_sub_tab["signif_status"] = ""
-    
+
     # blue: significant pvalues for omnibus and univariate tests
     cond = np.where((region_sub_tab.JASS_PVAL < significance_treshold) & (
         region_sub_tab.UNIVARIATE_MIN_PVAL < significance_treshold))[0]
@@ -129,7 +129,7 @@ def get_region_summary(sum_stat_tab, phenotype_ids, significance_treshold=5*10**
     # add minimum univariate p-value
     univar = sum_stat_tab.groupby("Region").min().UNIVARIATE_MIN_PVAL
     region_sub_tab.loc[univar.index, "UNIVARIATE_MIN_PVAL"] = univar.values
-    
+
     # Tag SNPs depending on which test is significant
     region_sub_tab.reset_index(inplace=True)
     region_sub_tab = add_signif_status_column(
@@ -137,7 +137,7 @@ def get_region_summary(sum_stat_tab, phenotype_ids, significance_treshold=5*10**
     # reorder columns
     region_sub_tab = region_sub_tab[['Region', "MiddlePosition", "snp_ids", "CHR", "position",
-                                     "Ref_allele", "Alt_allele", "JASS_PVAL", "UNIVARIATE_MIN_PVAL", 
+                                     "Ref_allele", "Alt_allele", "JASS_PVAL", "UNIVARIATE_MIN_PVAL",
                                      "signif_status"] + phenotype_ids]

     return region_sub_tab
@@ -236,25 +236,25 @@ def create_worktable_file(
     :type pos_Start: str
     :param pos_End: end of the position of the studied region (base point) for local analysis
     :type pos_End: str
-    
+
     :return: Number of chunks used to write workTable file. \
     This information is useful for reading the file
     :rtype: int
     """
     # number of phenotypes beyond which we change the algorithm
     K_NB_PHENOTYPES_BIG = 18
-    
+
     # Upper bound of the chromosome length (bp)
     K_POS_MAX = 250000000
-    
+
     # Minimum and maximum limit of regions for each chromosome (multiples of 50)
     Min_pos_chr = [   0,  100,  250,  400,  500,  600,  700,  800,  900, 1000, 1050,
                    1150, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1650]
     Max_pos_chr = [ 150,  300,  400,  550,  650,  750,  850,  950, 1050, 1100, 1200,
                    1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1700, 1750]
-    
+
     N_pheno = len(phenotype_ids)
-    
+
     # Controls the number of phenotypes
     if (N_pheno > 64):
         print("ERROR: {} phenotypes are selected. \nThe current version of JASS cannot analyze more than 64 phenotypes" \
@@ -262,7 +262,7 @@ def create_worktable_file(
         raise ValueError("Maximum number of phenotypes exceeded")
     elif (N_pheno >= 20):
         print("WARNING: {} phenotypes are selected. \nThe computation will be very long!".format(N_pheno))
-    
+
     if (chromosome is None):
         local_analysis = False
         print("============== Whole genome analysis ===============")
@@ -274,7 +274,7 @@ def create_worktable_file(
             raise ValueError("create_worktable_file: the required argument chromosome is not a number")
         else:
             num_Chr = int(chromosome)
-    
+
         if ((pos_Start is None) and (pos_End is None)):
             chromosome_full = True
             print("------ Chromosome : {} ------".format(num_Chr))
@@ -285,9 +285,9 @@ def create_worktable_file(
             if ((pos_End is None) or (not pos_End.isdigit())):
                 pos_End = K_POS_MAX
             print("------ Chromosome : {} ({} - {}) ------".format(num_Chr, pos_Start, pos_End))
-    
+
     print("Phenotypes = {}".format(phenotype_ids))
-    
+
     # Initialization of Jass_progress
     progress_path = os.path.join(os.path.dirname(
         project_hdf_path), "JASS_progress.txt")
@@ -301,11 +301,11 @@ def create_worktable_file(
     if os.path.exists(project_hdf_path):
         os.remove(project_hdf_path)
     hdf_work = HDFStore(project_hdf_path)
-    
+
     if csv_file is not None:
         if os.path.exists(csv_file):
             os.remove(csv_file)
-    
+
     if delayed_gen_csv_file:
         # setting a lock to generate the csv_file asynchronously
         the_lock_path = os.path.join(os.path.dirname(project_hdf_path), "the_lock.txt")
@@ -329,31 +329,31 @@ def create_worktable_file(
     )  # Covariance matrix
     regions = read_hdf(init_file_path, "Regions").index.tolist()

-    sum_stat_tab_min_itemsizes = {"snp_ids": 50, "Region": 10, "CHR": 5}
+    sum_stat_tab_min_itemsizes = {"snp_ids": 80, "Region": 10, "CHR": 5}
     region_sub_table_min_itemsizes = {
-        "Region": 10, "index": 10, "CHR": 5, "snp_ids": 50, "signif_status": 20}
+        "Region": 10, "index": 10, "CHR": 5, "snp_ids": 50, "signif_status": 20, "Ref_allele": 70, "Alt_allele": 70}

     smart_na_computation = not (remove_nan)

     module_name, function_name = stat.split(":")
     stat_module = importlib.import_module(module_name)
     stat_fn = getattr(stat_module, function_name)
-    
+
     if (N_pheno < K_NB_PHENOTYPES_BIG):
         big = False
         sub_cov_matrix = sub_cov
     else:
         big = True
         sub_cov_matrix = sub_cov.to_numpy()
-    
-    stat_compute = choose_stat_function(smart_na_computation, 
+
+    stat_compute = choose_stat_function(smart_na_computation,
                                         optim_na, big, function_name,
-                                        stat_fn, 
-                                        sub_cov_matrix, 
+                                        stat_fn,
+                                        sub_cov_matrix,
                                         samp_size=phenolist['Effective_sample_size'],
                                         **kwargs)
-    
+
     # read data by chunks to optimize memory usage
     if (not local_analysis):
         Nchunk = len(regions) // chunk_size + 1
@@ -362,14 +362,14 @@ def create_worktable_file(
         chunk_size = 50
         Nchunk = Max_pos_chr[num_Chr - 1] // chunk_size
         start_value = Min_pos_chr[num_Chr - 1] // chunk_size
-    
+
     # selection criterion in the case of a partial analysis by chromosome and position
     if (chromosome_full):
         Local_criteria = "(CHR == {})".format(chromosome)
     else:
         Local_criteria = "(CHR == {}) and (position >= {}) and (position <= {})"\
             .format(chromosome, pos_Start, pos_End)
-    
+
     Nsnp_total = 0
     Nsnp_jassed = 0

@@ -380,23 +380,23 @@ def create_worktable_file(
         binf = chunk * chunk_size
         bsup = (chunk+1) * chunk_size
-    
+
         sum_stat_tab = read_hdf(init_file_path, 'SumStatTab', columns=[
             'Region', 'CHR', 'position', 'snp_ids', 'Ref_allele', 'Alt_allele',
             'MiddlePosition'] + phenotype_ids,
             where='Region >= {0} and Region < {1}'.format(binf, bsup))
-    
+
         print("Regions {0} to {1}\r".format(binf, bsup))
-    
+
         if(local_analysis):
             # Data extraction in the case of a partial analysis
             sum_stat_tab = sum_stat_tab.query(Local_criteria)
-    
+
         # Remake row index unique: IMPORTANT for assignation with .loc
         sum_stat_tab.dropna(
             axis=0, subset=phenotype_ids,
             how=how_dropna, inplace=True
         )
         sum_stat_tab.reset_index(drop=True, inplace=True)
-    
+
         if sum_stat_tab.shape[0] == 0:
             print(
                 "No data available for region {0} to region {1}".format(binf, bsup))
@@ -410,48 +410,48 @@ def create_worktable_file(
         else:
             if not big:
                 # Algorithm optimized for a small number of phenotypes
-    
+
                 # Sort SumStatTab by missing patterns
                 patterns_missing, frequent_pattern = compute_frequent_missing_pattern(
                     sum_stat_tab[phenotype_ids])
-    
-                sum_stat_tab["patterns_missing"] = patterns_missing 
+
+                sum_stat_tab["patterns_missing"] = patterns_missing
                 z1 = sum_stat_tab[phenotype_ids]
-    
+
                 # Apply the statistic computation by missing patterns
                 for pattern in frequent_pattern:
                     bool_serie = (patterns_missing == pattern)
                     Selection_criteria = sum_stat_tab["patterns_missing"] == pattern
-    
+
                     try:
                         sum_stat_tab.loc[bool_serie, "JASS_PVAL"] = stat_compute(z1[Selection_criteria], pattern)
                     except ValueError:
                         print("worktable")
-    
+
             else:
                 # Algorithm optimized for a high number of phenotypes
-    
+
                 # Sort SumStatTab by missing patterns
                 patterns_missing, frequent_pattern, dico_index_y = \
                     compute_frequent_missing_pattern_Big(sum_stat_tab[phenotype_ids])
-    
+
                 sum_stat_tab["index"] = sum_stat_tab.index.tolist()
                 sum_stat_tab["patterns_missing"] = patterns_missing
-    
-                # In our case, the "apply" function transforms all numeric columns into float 64-bit encoded columns. 
-                # However, the mantissa of a float is encoded on 52 bits while we use an integer code using 64 bits. 
-                # Beyond 52 phenotypes, the automatic transformation integer-float induces a false pattern code. 
+
+                # In our case, the "apply" function transforms all numeric columns into float 64-bit encoded columns.
+                # However, the mantissa of a float is encoded on 52 bits while we use an integer code using 64 bits.
+                # Beyond 52 phenotypes, the automatic transformation integer-float induces a false pattern code.
                 # Transforming our code into a string keeps the coding accuracy beyond 52 phenotypes up to 64 phenotypes.
                 sum_stat_tab = sum_stat_tab.astype({"patterns_missing": str})
-    
+
                 Liste_colonnes = ["index", "patterns_missing"] + phenotype_ids
                 dico_z = {}
                 dico_index_x = {}
-    
-                sum_stat_tab[Liste_colonnes].apply(lambda x: store_pattern(dico_z, dico_index_x, *x), axis=1) 
-    
+
+                sum_stat_tab[Liste_colonnes].apply(lambda x: store_pattern(dico_z, dico_index_x, *x), axis=1)
+
                 Retour_omnibus_bypattern = {}
-    
+
                 # Apply the statistic computation by missing patterns
                 for pattern in frequent_pattern:
                     try:
@@ -462,15 +462,15 @@ def create_worktable_file(
                         )
                     except ValueError:
                         print("worktable")
-    
+
                 Retour_omnibus = [0.0 for i in range(sum_stat_tab.shape[0])]
                 for pattern in frequent_pattern:
                     for ligne, indice in enumerate(dico_index_x[pattern]):
                         Retour_omnibus[int(indice)] = (Retour_omnibus_bypattern[pattern])[int(ligne)]
-    
+
                 sum_stat_tab["JASS_PVAL"] = Retour_omnibus
-    
+
         Nsnp_jassed = Nsnp_jassed + sum_stat_tab.shape[0]
         sum_stat_tab.sort_values(by=["Region", "CHR"], inplace=True)
@@ -496,8 +496,8 @@ def create_worktable_file(
                                     + phenotype_ids]

         if post_filtering:
-            sum_stat_tab = post_computation_filtering(sum_stat_tab) 
-    
+            sum_stat_tab = post_computation_filtering(sum_stat_tab)
+
         hdf_work.append(
             "SumStatTab", sum_stat_tab, min_itemsize=sum_stat_tab_min_itemsizes
         )
@@ -505,7 +505,7 @@ def create_worktable_file(
         if ((csv_file is not None) and (not delayed_gen_csv_file)):
             with open(csv_file, 'a') as f:
                 sum_stat_tab.to_csv(f, header=f.tell()==0)
-    
+
         region_sub_table = get_region_summary(
             sum_stat_tab, phenotype_ids, significance_treshold=significance_treshold)
@@ -546,13 +546,13 @@ def create_worktable_file(
     )
     summaryTable.columns = ["PhenoSignif", "NoPhenoSignif"]
     summaryTable.index = ["JOSTSignif", "NoJOSTSignif"]
-    
+
     hdf_work = HDFStore(project_hdf_path)
     hdf_work.put(
         "summaryTable", summaryTable, format="table", data_columns=True
     )  # Summary Table (contigency table)
     hdf_work.close()
-    
+
     return Nchunk
@@ -561,7 +561,7 @@ def binary_code(*args):
     binary_code
     Generates the binary code of each pattern ensuring compatibility between Linux and Windows
     """
-    Chaine = "" 
+    Chaine = ""
     for valeur in args:
         Chaine += "{}".format(valeur)
     return int(Chaine, 2)
@@ -572,19 +572,19 @@ def binary_code_Big(dico_index_y, *args):
     binary_code
     Generates the binary code of each pattern ensuring compatibility between Linux and Windows
     """
-    Chaine = "" 
+    Chaine = ""
     for valeur in args:
         Chaine += "{}".format(valeur)
-    
+
     Codage = int(Chaine, 2)
-    
+
     if (not (Codage in dico_index_y)):
         dico_index_y[Codage] = []
         for indice, valeur in enumerate(args):
             if (valeur == 1):
-                dico_index_y[Codage].append(indice) 
-    
-    return Codage 
+                dico_index_y[Codage].append(indice)
+
+    return Codage
@@ -594,18 +594,18 @@ def store_pattern(dico_z, dico_index_x, *colonne):
     """
     Index = int(colonne[0])
     Codage = int(colonne[1])
-    
+
     if (not (Codage in dico_z)):
         dico_z[Codage] = []
         dico_index_x[Codage] = []
-    
+
     dico_index_x[Codage].append(Index)
-    
+
     new_line = []
     for valeur in colonne[2:]:
         if not math.isnan(valeur):
             new_line.append(valeur)
-    
+
     dico_z[Codage].append(new_line)
@@ -614,10 +614,10 @@ def compute_frequent_missing_pattern(sum_stat_tab):
     Compute the frequency of missing pattern in the dataset
     """
     Pheno_is_present = 1- sum_stat_tab.isnull()
-    
+
     # The coding of patterns missing is not guaranteed if there are more than 64 phenotypes
     patterns_missing = Pheno_is_present[Pheno_is_present.columns].apply(lambda x: binary_code(*x), axis=1)
-    
+
     pattern_frequency = patterns_missing.value_counts() / len(patterns_missing)
     n_pattern = pattern_frequency.shape[0]
     print("Number of pattern {}".format(n_pattern))
@@ -628,22 +628,22 @@ def compute_frequent_missing_pattern_Big(sum_stat_tab):
     """
     Compute the frequency of missing pattern in the dataset
-    
+
     """
     dico_index_y = {}
-    
+
     Pheno_is_present = 1- sum_stat_tab.isnull()
-    
+
     # The coding of patterns missing is not guaranteed if there are more than 64 phenotypes
     patterns_missing = Pheno_is_present[Pheno_is_present.columns] \
         .apply(lambda x: binary_code_Big(dico_index_y, *x), axis=1)
-    
+
     pattern_frequency = patterns_missing.value_counts() / len(patterns_missing)
     n_pattern = pattern_frequency.shape[0]
     print("Number of pattern {}".format(n_pattern))
     frequent_pattern = pattern_frequency.index.tolist()
-    
+
     return patterns_missing, frequent_pattern, dico_index_y
@@ -738,10 +738,10 @@ def get_worktable_local_manhattan_data(project_hdf_path: str, chromosome: str =
                          columns=["Region", "CHR", "position", "snp_ids", "JASS_PVAL"],
                          where=['Region='+str(region_int), 'CHR='+str(chromosome_int)])
-    
+
     dataframe = stringize_dataframe_region_chr(dataframe)
     dataframe = dataframe.sort_values("position")
-    
+
     return dataframe.to_csv(index=False)
@@ -770,7 +770,7 @@ def get_worktable_local_heatmap_data(project_hdf_path: str, chromosome: str = No
         chromosome_int = chromosome[3:]
     dataframe = read_hdf(project_hdf_path, "SumStatTab",
                          where=['Region='+str(region_int), 'CHR='+str(chromosome_int)])
-    
+
     dataframe = stringize_dataframe_region_chr(dataframe)
     dataframe = dataframe.sort_values("position")
     dataframe.drop(["Region", "CHR", "position", "JASS_PVAL", "MiddlePosition", "UNIVARIATE_MIN_PVAL",
@@ -784,28 +784,28 @@ def get_worktable_local_heatmap_data(project_hdf_path: str, chromosome: str = No
     pivoted_dataframe = pivoted_dataframe.reindex(column_order, axis=1)
     # TODO rework the selection to return 5000 snps in total, around the
     # region ;)
-    
+
     return pivoted_dataframe.to_csv(index_label="ID")


 def create_genome_full_csv(project_hdf_path, csv_file, chunk_size=50, Nchunk=35):
     """
     create_genome_full_csv
-    Write the genome_full.csv file 
+    Write the genome_full.csv file
+
     :param project_hdf_path: path to the worktable file
     :type project_hdf_path: str
-    
+
     :param csv_file: path to the genome_full file
     :type csv_file: str
-    
+
     :param chunk_size: the size of the chunks of the initial data
     :type chunk_size: int
-    
+
     :return: csv_file is available.
     :rtype: bool
     """
-    
+
     # path of the lock that indicates that the csv file is not available
     the_lock_path = os.path.join(os.path.dirname(project_hdf_path), "the_lock.txt")
@@ -815,27 +815,26 @@ def create_genome_full_csv(project_hdf_path, csv_file, chunk_size=50, Nchunk=35)
             # An error occurred: the csv file must not exist if the lock is set
             # The existing csv file is deleted
             os.remove(csv_file)
-    
+
     for chunk in range(Nchunk):
         print("indice de boucle = {}".format(chunk))
         binf = chunk * chunk_size
         bsup = (chunk + 1) * chunk_size
-    
+
         # read workTable.hdf5
-        df_for_csv = read_hdf(project_hdf_path, "SumStatTab", 
+        df_for_csv = read_hdf(project_hdf_path, "SumStatTab",
                               where='Region >= {0} and Region < {1}'.format(binf, bsup))
-    
+
         # append the data to the csv file
         with open(csv_file, 'a') as f:
            df_for_csv.to_csv(f, header=f.tell() == 0)
-    
+
     # The lock is deleted
     os.remove(the_lock_path)
-    
+
     if (os.path.isfile(csv_file)):
         The_file_is_available = True
     else:
         The_file_is_available = False
-    
-    return The_file_is_available 
+    return The_file_is_available
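
The comment block kept in this patch (casting `patterns_missing` to `str` for high phenotype counts) rests on float64 having a 52-bit mantissa. Below is a minimal standalone sketch, not part of the patch, using toy column names and a simplified `binary_code` that mirrors the one in this file; it shows how the missing-data pattern of each SNP is turned into an integer code, and why two distinct codes can collide once pandas silently converts them to float.

```python
import numpy as np
import pandas as pd

def binary_code(*presence_flags):
    # Concatenate the 0/1 presence flags into a bit string and read it in base 2.
    return int("".join(str(int(v)) for v in presence_flags), 2)

# Toy z-score table: trait_a is missing for the second SNP, trait_b for the first.
z_scores = pd.DataFrame({"trait_a": [1.2, np.nan], "trait_b": [np.nan, 0.3]})
pheno_is_present = 1 - z_scores.isnull()          # 1 where a z-score is available
patterns = pheno_is_present.apply(lambda row: binary_code(*row), axis=1)
print(patterns.tolist())                          # [2, 1]  ('10' -> 2, '01' -> 1)

# Why the code is kept as a string beyond ~52 phenotypes: float64 carries a
# 52-bit mantissa, so distinct 64-bit integer codes can collapse to one value.
print(float(2**53) == float(2**53 + 1))           # True
```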
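
The `min_itemsize` widths touched by this patch (`snp_ids` raised to 80, plus explicit widths for `Ref_allele` and `Alt_allele`) matter because the worktable is written chunk by chunk with `HDFStore.append`, which freezes each string column's width at the first append. A small sketch of that behaviour, with an assumed file name and toy data (requires PyTables, a JASS dependency):

```python
import pandas as pd

chunk1 = pd.DataFrame({"snp_ids": ["rs12345"], "JASS_PVAL": [1e-9]})
chunk2 = pd.DataFrame({"snp_ids": ["chr1:12345_ACGTACGTACGT_A_long_indel_id"], "JASS_PVAL": [2e-9]})

with pd.HDFStore("toy_worktable.h5", mode="w") as store:
    # Reserve 80 characters for snp_ids up front; without min_itemsize the column
    # width would be fixed by chunk1 ("rs12345") and appending chunk2 would raise
    # a ValueError because its identifier no longer fits.
    store.append("SumStatTab", chunk1, min_itemsize={"snp_ids": 80})
    store.append("SumStatTab", chunk2)
```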