From f28feeee4346e4c03dd281cc00a5d245845e15c8 Mon Sep 17 00:00:00 2001
From: hanna julienne <hanna.julienne@pasteur.fr>
Date: Mon, 18 Nov 2019 11:42:28 +0100
Subject: [PATCH] corrected problem when the same file is used for several GWAS

---
 doc/source/index.rst           |  5 +++--
 jass_preprocessing/__main__.py | 20 +++++++++++++++-----
 jass_preprocessing/map_gwas.py |  6 ++----
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/doc/source/index.rst b/doc/source/index.rst
index 733fd0b..83387d8 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -70,7 +70,7 @@ Input
 |  1  |14930| rs75454623 |  A  |  G  | 0.482228|
 +-----+-----+------------+-----+-----+---------+
 
-* Folder containing all raw gwas data (all chromosomes in one file) (minimal conditions?? tab separated?)
+* Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
 * a list containing the name of GWAS file to the string format.
 * A descriptor csv files that will described each GWAS summary statistic files:
 
@@ -84,7 +84,7 @@ Input
 +===========================================+============================================================+
 |             path to the data              |                            filename                        |
 +-------------------------------------------+------------------------------------------------------------+
-|            study info fields              | consortia,outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
+|            study info fields              |Consortium,Outcome,fullName,type,Nsample,Ncase,Ncontrol,Nsnp|
 +-------------------------------------------+------------------------------------------------------------+
 |    names of the header in the GWAS file   |      snpid,a1,a2,freq,pval,n,z,OR,se,code,imp,ncas,ncont   |
 +-------------------------------------------+------------------------------------------------------------+
@@ -92,6 +92,7 @@ Input
 .. Give an example
 .. |               I don't know                 |                          altNcas,altNcont|
 
+Note that the combination of Consortium and Outcome must be unique, because it is used as an index during the cleaning process.
 
 Here is an example of descriptor field, the field irrelevant (for example odd ratio for continuous trait) for the study must be filled with na. 
 
diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py
index 112b75f..2127191 100644
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -21,22 +21,32 @@ import argparse
 #| pathOUT | **unused in main_preprocessing.py**  | netPath+'PCMA/1._DATA/RAW.summary/'|
 #| ImpG_output_Folder | main ouput folder | netPath+ 'PCMA/1._DATA/preprocessing_test/' |
 
+def raise_duplicated_index(tag):  # tag: pd.Series of 'Consortium_Outcome' labels; always raises ValueError
+    duplicated_index = tag[tag.duplicated()].unique().tolist()  # offending tag values, not the boolean mask
+    raise ValueError("'Consortium_Outcome' are duplicated for: {0}".format(duplicated_index))
 
 def launch_preprocessing(args):
     """
     Preprocessing GWAS dataset
     """
     gwas_map = pd.read_csv(args.gwas_info, sep="\t")
-    gwas_map.set_index("filename", inplace=True)
 
-    for gwas_filename in gwas_map.index:
-        tag = "{0}_{1}".format(gwas_map.loc[gwas_filename, 'Consortium'],
-                               gwas_map.loc[gwas_filename, 'Outcome'])
+    # Build a unique "Consortium_Outcome" tag per study; it replaces "filename" as the index
+    gwas_map['tag'] = gwas_map.Consortium + "_" + gwas_map.Outcome
+
+    if gwas_map.tag.duplicated().any():
+        raise_duplicated_index(gwas_map.tag)
+
+    gwas_map.set_index("tag", inplace=True)
+
+    for tag in gwas_map.index:
+        # The same file may be listed for several GWAS, so look the filename up by tag
+        gwas_filename = gwas_map.loc[tag, "filename"]
 
         print('processing GWAS: {}'.format(tag))
         start = time.time()
         GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
-        mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
+        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
 
         gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
 
diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py
index 3cf1a5c..901f609 100644
--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -76,21 +76,19 @@ def convert_missing_values(df):
     return df.replace(def_missing, nan_vec)
 
 
-def map_columns_position(gwas_internal_link,  GWAS_labels):
+def map_columns_position(gwas_internal_link,  my_labels):
     """
     Find column position for each specific Gwas
 
     Args:
         gwas_internal_link (str): filename of the GWAS data (with path)
-        GWAS_labels (str): filename of the csv information file
+        my_labels (pd.Series): row of the information file describing this GWAS
 
     Return:
         pandas Series with column position and column names as index
     """
 
-    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
 
-    column_dict.set_index("filename", inplace=True)
     print(gwas_internal_link)
     gwas_file = gwas_internal_link.split('/')[-1]
     my_labels = column_dict.loc[gwas_file]
-- 
GitLab