diff --git a/bin/extract_sample_size.py b/bin/extract_sample_size.py
new file mode 100644
index 0000000000000000000000000000000000000000..cae2b7d8fa39d72780e7a0fc6155a63338ee5d85
--- /dev/null
+++ b/bin/extract_sample_size.py
@@ -0,0 +1,15 @@
+import pandas as pd
+import sys
+
+# Usage: extract_sample_size.py <summary-stat file name> <metadata table path>
+f = sys.argv[1]
+meta_data_loc = sys.argv[2]
+# Trait id is the file name up to its first '.'.
+d = f.split('.')[0]
+
+D = pd.read_csv(meta_data_loc, sep="\t")
+# Index rows by the "z_<Consortium>_<Outcome>" trait naming convention.
+D.index = "z_"+D.Consortium+"_"+D.Outcome
+
+Nsample = D.loc[d, "Nsample"]
+print(Nsample)
diff --git a/bin/generate_trait_pairs.py b/bin/generate_trait_pairs.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fa5dcb7bd08e8fbb1b960d9aec86035b44ae0ca
--- /dev/null
+++ b/bin/generate_trait_pairs.py
@@ -0,0 +1,24 @@
+import glob
+from itertools import combinations
+import os
+
+# Pair up every *.gz file in the working directory and write the pairs out in
+# chunks of `size_chunk`, one ';'-separated line per pairs_chunk_<i>.txt file.
+cwd = os.getcwd()
+print(cwd)
+L = glob.glob('{}/*.gz'.format(cwd))
+L_s = [j.split('/')[-1] for j in L]
+L_combi = [",".join(map(str, comb)) for comb in combinations(L_s, 2)]
+print(L_s)
+size_chunk = 75
+
+N_files = len(L_combi)//size_chunk
+
+for i in range(N_files+1):
+    start = i*size_chunk
+    chunk = L_combi[start:start+size_chunk]
+    # The final chunk may be empty when len(L_combi) divides evenly.
+    if chunk:
+        # Use a context manager so the chunk file is closed (and flushed).
+        with open("pairs_chunk_{0}.txt".format(i), "w") as out:
+            print(";".join(chunk), file=out)
diff --git a/bin/parse_correlation_results.py b/bin/parse_correlation_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..9da81706e099682564817c3553b8c31e89616dab
--- /dev/null
+++ b/bin/parse_correlation_results.py
@@ -0,0 +1,94 @@
+import re
+import pandas as pd
+import glob
+import numpy as np
+
+print("Parsing_correlation")
+file_input = glob.glob("*.log")
+print(file_input)
+
+
+def get_trait(fi):
+    """Recover the two trait names from a log named '<T1>-_-<T2>.log'."""
+    traits = re.search(r'([-0-9A-Z]+_[-0-9A-Z-]+)-_-([-0-9A-Z]+_[-0-9A-Z-]+).log', fi)
+    return [traits.group(1), traits.group(2)]
+
+
+# Flatten the per-file trait pairs into the unique set of traits.
+traits = [get_trait(fi) for fi in file_input]
+traits = list(set([trait for trait_pair in traits for trait in trait_pair]))
+print(traits)
+traits_ind = 'z_' + pd.Series(traits)
+
+### Create matrices:
+Covariance_matrix_H0 = pd.DataFrame(index=traits_ind, columns=traits_ind)
+Covariance_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)
+Correlation_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)
+
+for i1, t1 in enumerate(traits):
+    for t2 in traits[(i1+1):]:
+        print(t1, t2)
+        # The log may exist under either trait order; try both, and remember
+        # which order was used ('flip') since per-trait lines swap with it.
+        flip = False
+        cov_fi = "{0}-_-{1}.log".format(t1, t2)
+        try:
+            with open(cov_fi, "r") as fh:
+                L = fh.readlines()
+        except FileNotFoundError:
+            cov_fi = "{0}-_-{1}.log".format(t2, t1)
+            flip = True
+            try:
+                with open(cov_fi, "r") as fh:
+                    L = fh.readlines()
+            except FileNotFoundError:
+                print("Not found")
+                print(t1, t2)
+                continue
+
+        print("PARSING")
+        print(cov_fi)
+        # retrieve the Intercept / gencov / correlation / h2 summary lines
+        L_intercept = [x for x in L if re.match("Intercept:", x)]
+        L_gencov = [x for x in L if re.match('Total Observed scale gencov', x)]
+        L_gencor = [x for x in L if re.match('Genetic Correlation', x)]
+        L_h2 = [x for x in L if re.match('Total Observed scale h2', x)]
+
+        t1_col = "z_" + t1
+        t2_col = "z_" + t2
+
+        # A complete log carries three "Intercept:" lines; the third is the
+        # cross-trait intercept.
+        if len(L_intercept) == 3:
+            cross_intercept = float(L_intercept[2].split(" ")[1])
+            Covariance_matrix_H0.loc[t1_col, t2_col] = cross_intercept
+            Covariance_matrix_H0.loc[t2_col, t1_col] = cross_intercept
+
+            gencov = float(L_gencov[0].split(":")[1].split(" ")[1])
+            Covariance_matrix_genetic.loc[t1_col, t2_col] = gencov
+            Covariance_matrix_genetic.loc[t2_col, t1_col] = gencov
+
+            Correlation_matrix_genetic.loc[t1_col, t1_col] = 1
+            Correlation_matrix_genetic.loc[t2_col, t2_col] = 1
+            print(L_gencor)
+            gencor = float(L_gencor[1].split(":")[1].split(" ")[1])
+            Correlation_matrix_genetic.loc[t1_col, t2_col] = gencor
+            Correlation_matrix_genetic.loc[t2_col, t1_col] = gencor
+
+            # Under the flipped file name the per-trait intercept/h2 lines
+            # appear in the opposite order.
+            if flip:
+                Covariance_matrix_H0.loc[t1_col, t1_col] = float(L_intercept[1].split(" ")[1])
+                Covariance_matrix_H0.loc[t2_col, t2_col] = float(L_intercept[0].split(" ")[1])
+                Covariance_matrix_genetic.loc[t1_col, t1_col] = float(L_h2[1].split(":")[1].split(" ")[1])
+                Covariance_matrix_genetic.loc[t2_col, t2_col] = float(L_h2[0].split(":")[1].split(" ")[1])
+            else:
+                Covariance_matrix_H0.loc[t1_col, t1_col] = float(L_intercept[0].split(" ")[1])
+                Covariance_matrix_H0.loc[t2_col, t2_col] = float(L_intercept[1].split(" ")[1])
+                Covariance_matrix_genetic.loc[t1_col, t1_col] = float(L_h2[0].split(":")[1].split(" ")[1])
+                Covariance_matrix_genetic.loc[t2_col, t2_col] = float(L_h2[1].split(":")[1].split(" ")[1])
+
+Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic.csv", sep="\t")
+Covariance_matrix_H0.to_csv("Covariance_matrix_H0.csv", sep="\t")
+# Was computed but never written out; save it alongside the covariance matrices.
+Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic.csv", sep="\t")
+
+print("Parsing_correlation")