Commit f28feeee authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

corrected problem when the same file is used for several GWAS

parent 823976b5
Pipeline #18402 passed with stages
in 53 seconds
......@@ -70,7 +70,7 @@ Input
| 1 |14930| rs75454623 | A | G | 0.482228|
+-----+-----+------------+-----+-----+---------+
* Folder containing all raw gwas data (all chromosomes in one file) (minimal conditions?? tab separated?)
* Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
* A list containing the names of the GWAS files, as strings.
* A descriptor CSV file that describes each GWAS summary statistics file:
......@@ -84,7 +84,7 @@ Input
+===========================================+============================================================+
| path to the data | filename |
+-------------------------------------------+------------------------------------------------------------+
| study info fields | consortia,outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
| study info fields | Consortium,Outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
+-------------------------------------------+------------------------------------------------------------+
| names of the header in the GWAS file | snpid,a1,a2,freq,pval,n,z,OR,se,code,imp,ncas,ncont |
+-------------------------------------------+------------------------------------------------------------+
......@@ -92,6 +92,7 @@ Input
.. Give an example
.. | I don't know | altNcas,altNcont|
Note that the combination of Consortium and Outcome must be unique, because it is used as an index in the cleaning process.
Here is an example of the descriptor fields. Any field that is irrelevant for a study (for example, the odds ratio for a continuous trait) must be filled with na.
......
......@@ -21,22 +21,32 @@ import argparse
#| pathOUT | **unused in main_preprocessing.py** | netPath+'PCMA/1._DATA/RAW.summary/'|
#| ImpG_output_Folder | main output folder | netPath+ 'PCMA/1._DATA/preprocessing_test/' |
def raise_duplicated_index(tag):
    """
    Raise an error listing the duplicated 'Consortium_Outcome' tags.

    Args:
        tag (pd.Series): tag values built from the GWAS description file

    Raises:
        ValueError: always; the message lists each tag value that occurs
            more than once in `tag`
    """
    # `duplicated()` returns a boolean mask over every row; use it to select
    # the offending values so the message names them instead of printing a
    # column of True/False flags. keep=False marks every occurrence, and
    # unique() reports each duplicated tag only once.
    duplicated_values = tag[tag.duplicated(keep=False)].unique().tolist()
    raise ValueError(
        "'Consortium_Outcome' are duplicated for: {0}".format(duplicated_values)
    )
def launch_preprocessing(args):
"""
Preprocessing GWAS dataset
"""
gwas_map = pd.read_csv(args.gwas_info, sep="\t")
gwas_map.set_index("filename", inplace=True)
for gwas_filename in gwas_map.index:
tag = "{0}_{1}".format(gwas_map.loc[gwas_filename, 'Consortium'],
gwas_map.loc[gwas_filename, 'Outcome'])
#define an unique
gwas_map['tag'] = gwas_map.Consortium+ "_" + D.Outcome
if gwas_map.tag.duplicated().any():
raise_duplicated_index(gwas_map.tag)
gwas_map.set_index("tag", inplace=True)
for tag in gwas_map.index:
gwas_filename = D.loc[tag, "filename"]
print('processing GWAS: {}'.format(tag))
start = time.time()
GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
......
......@@ -76,21 +76,19 @@ def convert_missing_values(df):
return df.replace(def_missing, nan_vec)
def map_columns_position(gwas_internal_link, GWAS_labels):
def map_columns_position(gwas_internal_link, my_labels):
"""
Find column position for each specific Gwas
Args:
gwas_internal_link (str): filename of the GWAS data (with path)
GWAS_labels (str): filename of the csv information file
GWAS_labels (pd.DataFrame): corresponding row of the information file
Return:
pandas Series with column position and column names as index
"""
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.set_index("filename", inplace=True)
print(gwas_internal_link)
gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict.loc[gwas_file]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment