corrected problem when the same file is used for several GWAS

f28feeee · Hanna JULIENNE · 823976b5 · f28feeee · f28feeee · f28feeee
Commit f28feeee authored 5 years ago by Hanna JULIENNE
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -70,7 +70,7 @@ Input
 |  1  |14930| rs75454623 |  A  |  G  | 0.482228|
 +-----+-----+------------+-----+-----+---------+

-* Folder containing all raw gwas data (all chromosomes in one file) (minimal conditions?? tab separated?)
+* Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
 * A list containing the names of the GWAS files, in string format.
 * A descriptor CSV file that describes each GWAS summary statistics file:

@@ -84,7 +84,7 @@ Input
 +===========================================+============================================================+
 |             path to the data              |                            filename                        |
 +-------------------------------------------+------------------------------------------------------------+
-|            study info fields              | consortia,outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
+|            study info fields              |Consortium,Outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
 +-------------------------------------------+------------------------------------------------------------+
 |    names of the header in the GWAS file   |      snpid,a1,a2,freq,pval,n,z,OR,se,code,imp,ncas,ncont   |
 +-------------------------------------------+------------------------------------------------------------+
@@ -92,6 +92,7 @@ Input
 .. Give an example
 .. |               I don't know                 |                          altNcas,altNcont|

+Note that the combination of Consortium and Outcome must be unique, because it is used as an index during the cleaning process.

 Here is an example of descriptor fields; any field that is irrelevant for the study (for example, the odds ratio for a continuous trait) must be filled with na.


--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -21,22 +21,32 @@ import argparse
 #| pathOUT | **unused in main_preprocessing.py**  | netPath+'PCMA/1._DATA/RAW.summary/'|
 #| ImpG_output_Folder | main ouput folder | netPath+ 'PCMA/1._DATA/preprocessing_test/' |

+def raise_duplicated_index(tag):
+    duplicated_index = tag.duplicated()
+    raise ValueError("'Consortium_Outcome' are duplicated for: {0}".format(duplicated_index))

 def launch_preprocessing(args):
    """
    Preprocessing GWAS dataset
    """
    gwas_map = pd.read_csv(args.gwas_info, sep="\t")
-    gwas_map.set_index("filename", inplace=True)

-    for gwas_filename in gwas_map.index:
-        tag = "{0}_{1}".format(gwas_map.loc[gwas_filename, 'Consortium'],
-                               gwas_map.loc[gwas_filename, 'Outcome'])
+    # define a unique tag (Consortium_Outcome) for each GWAS
+    gwas_map['tag'] = gwas_map.Consortium+ "_" + D.Outcome
+
+    if gwas_map.tag.duplicated().any():
+        raise_duplicated_index(gwas_map.tag)
+
+    gwas_map.set_index("tag", inplace=True)
+
+    for tag in gwas_map.index:
+
+        gwas_filename = D.loc[tag, "filename"]

        print('processing GWAS: {}'.format(tag))
        start = time.time()
        GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
-        mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
+        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])

        gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)


--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -76,21 +76,19 @@ def convert_missing_values(df):
    return df.replace(def_missing, nan_vec)


-def map_columns_position(gwas_internal_link,  GWAS_labels):
+def map_columns_position(gwas_internal_link,  my_labels):
    """
    Find column position for each specific Gwas

    Args:
        gwas_internal_link (str): filename of the GWAS data (with path)
-        GWAS_labels (str): filename of the csv information file
+        my_labels (pd.Series): row of the information file corresponding to this GWAS

    Return:
        pandas Series with column position and column names as index
    """

-    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')

-    column_dict.set_index("filename", inplace=True)
    print(gwas_internal_link)
    gwas_file = gwas_internal_link.split('/')[-1]
    my_labels = column_dict.loc[gwas_file]