diff --git a/prep/main_V2.py b/prep/main_V2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e993eb3af237784766cbbd82e9e162f9f74629ff
--- /dev/null
+++ b/prep/main_V2.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+
+# This script was developed by Michel Fisun in the statistical genetics lab, Institut Pasteur, Paris,
+# under the supervision of Hugues Aschard.
+# It was modified and debugged by Sarah Djebali and Zoe Gerber at IRSD, Toulouse.
+
+
+# IMPORTS --------------------------------------------------------------------
+from sqlite3 import DatabaseError
+import pandas as pd
+import numpy as np
+import time
+from pathlib import Path as path
+from optparse import OptionParser
+import sys
+from multiprocess import Pool
+# ----------------------------------------------------------------------------
+
+
+# FUNCTIONS -----------------------------------------------------------------
+# This function outputs a list of 22 dataframes indexed by chromosome id.
+# Each element of the list is a tuple (chromosome number, dataframe of the SNPs located on that chromosome).
+def ChromosomeSplitter(bank : str, separator : str, cname : str) -> "list[pd.DataFrame]" :
+    """
+    data bank file name -> list[CHR1, CHR2, ..., CHR22]
+    Splits the data bank into per-chromosome dataframes stored in a list.
+    """
+    print("Starting GWAS dataset splitter...")
+    start_time = time.time()
+    chr_list = []  # list of tuples: (chromosome_number, chromosome dataframe)
+
+    # reading data
+    print("\nReading data...")
+    data_bank = pd.read_csv(bank, index_col=False, sep=separator)
+    print("Data read!\n")
+
+    # building one dataframe per chromosome
+    for i in range(1, 22 + 1):
+        print("Building chromosome %s file..." % i)
+        chr_list.append((i, data_bank[data_bank[cname] == i]))
+
+    print("Done splitting chromosomes!")
+    print("--- done splitting chromosomes in %s seconds ---\n" % (time.time() - start_time))
+    print("\nChromosomes generated:")
+    print([n for (n, _) in chr_list])
+    print("\n\n")
+
+    return chr_list
+
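+
+# Illustrative call (a sketch added for documentation using this script's own defaults,
+# not part of the original pipeline):
+#   ChromosomeSplitter("data/input/CAD_META", "\t", "CHR")
+#   -> [(1, <dataframe of the chr 1 SNPs>), ..., (22, <dataframe of the chr 22 SNPs>)]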
+
+def SortPerPValue(chr : tuple, Phead : str) -> pd.DataFrame :
+    """
+    Sorts the SNPs of the i-th chromosome in ascending order of p-value
+    (best p-value first).
+    """
+    start_time = time.time()
+    i, chromosome = chr
+    quicksorted = chromosome.sort_values(by=Phead, ascending=True, kind='quicksort', ignore_index=True)
+
+    print(f"--- chromosome {i} sorted in {time.time() - start_time} seconds ---\n")
+
+    return quicksorted
+
+
+def SortPerPosition(l : pd.DataFrame, pos : str) -> pd.DataFrame :
+    """
+    Sorts one locus (a dataframe) by position.
+    """
+    start_time = time.time()
+    quicksorted = l.sort_values(by=pos, ascending=True, kind='quicksort', ignore_index=True)
+
+    print(f"--- locus sorted in {time.time() - start_time} seconds ---\n")
+
+    return quicksorted
+
+
+def LocusUnion(l1 : pd.DataFrame, l2 : pd.DataFrame, pos) -> pd.DataFrame :
+    """
+    Returns the union of two loci (outer merge on all columns).
+    """
+    quicksorted_l1 = l1.sort_values(by=pos, ascending=True, kind='quicksort', ignore_index=True)
+    quicksorted_l2 = l2.sort_values(by=pos, ascending=True, kind='quicksort', ignore_index=True)
+
+    return pd.merge(quicksorted_l1, quicksorted_l2, how="outer")
+
+
+def isIntersected(l1 : pd.DataFrame, l2 : pd.DataFrame) -> bool :
+    """
+    Returns True if l1 and l2 share at least one SNP, False otherwise.
+    """
+    return len(pd.merge(l1, l2, how="inner")) != 0
+
+
+def checkListLocus(liste : "list[tuple]", pos) -> "list[tuple]":
+    """
+    Checks the list of loci for overlapping loci.
+    When two loci overlap, merges them into the first one and marks the second for removal.
+    """
+    list_locus_to_remove = []
+    new_liste = []
+
+    for locus_index in range(len(liste)):
+        ii, current_locus = liste[locus_index]
+
+        # skip the current locus if it has already been marked for removal
+        # (i.e. it was already merged into an earlier locus)
+        if any(current_locus.equals(locus[1]) for locus in list_locus_to_remove):
+            continue
+
+        for locus_index2 in range(locus_index + 1, len(liste)):
+            _, test_locus = liste[locus_index2]
+
+            print(f"\nii,current_locus : {ii}, {current_locus}")
+            print(f"\n_,test_locus : {_}, {test_locus}")
+
+            if not current_locus.equals(test_locus) and isIntersected(current_locus, test_locus):
+                print("Overlapping loci found (cleanup pass)\n")
+                liste[locus_index] = (ii, SortPerPosition(LocusUnion(current_locus, test_locus, pos), pos))
+
+                locus_to_remove : tuple = (ii, test_locus)
+                list_locus_to_remove.append(locus_to_remove)
+
+                print(f"list_locus_to_remove : {list_locus_to_remove}")
+                print(f"list of loci at this point (cleanup pass): {liste}")
+
+            else:
+                print("No overlap\n")
+
+    # keep only the loci that were not marked for removal
+    new_liste = [x for x in liste if not any(y[1].equals(x[1]) for y in list_locus_to_remove)]
+
+    return new_liste
+
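+
+# Illustrative sketch of the overlap primitives above on toy data (an example added for
+# documentation, not part of the original pipeline; the column names are hypothetical):
+#   l1 = pd.DataFrame({"BP": [100, 200], "Pvalue": [1e-9, 1e-8]})
+#   l2 = pd.DataFrame({"BP": [200, 300], "Pvalue": [1e-8, 1e-7]})
+#   isIntersected(l1, l2)       # True: the row (BP=200, Pvalue=1e-8) is common to both
+#   LocusUnion(l1, l2, "BP")    # single dataframe with BP 100, 200, 300 (the shared row kept once)
+#   checkListLocus([(1, l1), (1, l2)], "BP")  # merges the two overlapping loci into one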
+
+
+# This function is the most important of all: it contains the intelligence of the whole process.
+# It is run on a single chromosome.
+# chr is a tuple (chromosome id, dataframe with the SNPs of that chromosome).
+# Phead and pos are the names of the p-value and position columns in the header of the GWAS file.
+# It outputs a list with as many elements as loci found in the chromosome;
+# each element is a tuple whose first item is the chromosome id and whose second item is the
+# dataframe of the SNPs belonging to that locus.
+def LocusList(chr : tuple, Phead : str, pos, kb, Pseuil) -> "list[tuple]":
+    """
+    Returns a list of all loci in the given chromosome.
+
+    Splits the SNPs of one chromosome into loci:
+    after the SNPs have been sorted by SortPerPValue(), takes the most significant SNP,
+    collects the region of +/- kb kilobases (500 by default) around it in the position-sorted
+    dataframe, and stores those SNPs as a locus.
+    Then repeats the same process with the next most significant SNP, provided it is not
+    already part of a previous locus.
+    """
+    start_time = time.time()
+
+    # i is the chromosome number, chromosome is a dataframe with all the SNPs of chr i
+    i, chromosome = chr
+    print(f"\nStarting to split chromosome {i} into loci...")
+    print(f"\nUnsorted SNP list of chr {i}:\n {chromosome}\n")
+
+    # sorted_snps is the dataframe of the SNPs of chr i sorted by increasing p-value
+    sorted_snps = SortPerPValue(chr, Phead)
+
+    print(f"SNP list of chr {i} sorted by p-value:\n{sorted_snps}")
+
+    locus_nb = 0
+    test = True
+    liste = []
+    new_liste = []
+    # len(sorted_snps) is the number of SNPs of this chromosome
+    for snp_index in range(len(sorted_snps)):
+        test = True
+
+        # the SNPs are sorted by increasing p-value, so once the threshold is exceeded
+        # there is no further significant SNP on this chromosome
+        if sorted_snps.iloc[snp_index][Phead] > float(Pseuil):
+            print(f"\nNo more significant SNP p-values in chromosome {i}\n")
+            break
+
+        # line is a one-row dataframe holding the current SNP (the best remaining p-value)
+        line = sorted_snps[snp_index:snp_index+1]
+
+        # pos_line is the position of that SNP on chromosome i
+        pos_line = int(line[pos].iloc[0])
+
+        # kb_nb is the number of bases searched upstream and downstream of the best SNP
+        # (500 kb on each side by default; a worked example follows this function)
+        kb_nb = int(kb) * 1000
+
+        # new_locus is a tuple (chromosome id, dataframe of the SNPs lying within +/- kb_nb bases
+        # of the best SNP and having a p-value below 0.01); the dataframe stays ordered by position
+        new_locus : tuple = (i, chromosome.loc[chromosome[pos].between(pos_line - kb_nb, pos_line + kb_nb) & (chromosome[Phead] < 0.01)])
+
+        print(f"new locus:\n {new_locus}")
+
+        # the first time, liste is empty
+        if len(liste) == 0:
+            liste.append(new_locus)
+            # count of the loci created
+            locus_nb = locus_nb + 1
+            print(f"\nCHR{i}locus{locus_nb} created!\n")
+            continue
+
+        # from the second SNP on, check the candidate locus against every locus already in the list
+        else:
+            for new_locus_index in range(len(liste)):
+                # ii and _ are both the chromosome id
+                # prev_locus is a locus (dataframe of SNPs) already appended to the list of loci
+                # nnew_locus is the dataframe of SNPs of the candidate locus
+                ii, prev_locus = liste[new_locus_index]
+                _, nnew_locus = new_locus
+
+                print(f"\nii,prev_locus : {ii}, {prev_locus}")
+                print(f"\n_,nnew_locus : {_}, {nnew_locus}")
+
+                if isIntersected(prev_locus, nnew_locus):
+                    print("Overlapping loci found!\n")
+                    print("Merging the loci...")
+                    liste[new_locus_index] = (ii, SortPerPosition(LocusUnion(prev_locus, nnew_locus, pos), pos))
+                    print("Done merging the loci!\n")
+                    print(f"\nNew locus {new_locus_index} created after merging : {liste[new_locus_index]}\n")
+                    print(f"list of loci at this point : {liste}")
+                    test = False
+                    break
+
+            # if there is no intersection with any existing locus (test is still True),
+            # append the candidate as a new locus
+            if test:
+                print("No intersection, appended new locus")
+                liste.append(new_locus)
+                locus_nb = locus_nb + 1
+                print(f"\nCHR{i}locus{locus_nb} is created!\n")
+                print(f"\n\n\n\nWe have {locus_nb} loci in our list : {liste}\n\n\n\n\n\n")
+
+    # merge any loci that still overlap, repeating the cleanup pass until the list is stable
+    new_liste = liste
+    for _ in range(len(liste)):
+        new_liste = checkListLocus(new_liste, pos)
+
+    print(f"\nDone splitting chromosome {i} into loci!")
+    # new_liste contains no overlapping loci any more
+    return new_liste
+
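+
+# Worked example for the window computation above (illustrative numbers, not taken from the
+# original code): with the default kb=500, kb_nb = 500 * 1000 = 500000; a top SNP at BP 1250000
+# therefore defines the window [750000, 1750000], and every SNP of the chromosome whose position
+# falls in that window and whose p-value is below 0.01 is collected into the new locus.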
+
+
+def ZscoreAdder(locus : tuple, Zhead : str, Effect : str, StdErr : str, pos : str, allele1 : str, allele2 : str, chr : str, pvalue_header : str) -> tuple:
+    """
+    Adds a Z-score column (Effect / StdErr) to the locus and keeps only the columns needed
+    downstream (a worked example follows printLocus below).
+    """
+    chr_nb, zLocus = locus
+
+    # work on a copy so the original chromosome dataframe is left untouched
+    # (also avoids pandas SettingWithCopyWarning on the filtered slice)
+    zLocus = zLocus.copy()
+    zLocus[Zhead] = zLocus[Effect] / zLocus[StdErr]
+
+    # keep (and reorder) only the columns: CHR BP ALLELE1 ALLELE2 EFFECT STDERR PVALUE ZSCORE
+    columns_to_keep = [chr, pos, allele1, allele2, Effect, StdErr, pvalue_header, Zhead]
+    zLocus = zLocus[columns_to_keep]
+
+    return (chr_nb, zLocus)
+
+
+def printLocus(liste : "list[tuple]", Zhead : str, Effect : str, StdErr : str, outdir : str, pos : str, allele1 : str, allele2 : str, chr : str, pvalue_header : str) -> None:
+    """
+    Writes one file per locus of the given list, named CHR<zero-padded chromosome>locus<rank>.
+    """
+    for i in range(len(liste)):
+        # liste[i] is the tuple (chromosome id, locus dataframe)
+        chr_nb, _ = liste[i]
+
+        _, locusZ = ZscoreAdder(liste[i], Zhead, Effect, StdErr, pos, allele1, allele2, chr, pvalue_header)
+
+        locusZ.to_csv(f"{outdir}/CHR{str(chr_nb).zfill(2)}locus{i+1}", index=False, sep=' ')
+        print(f"CHR{chr_nb}locus{i+1} printed!\n")
+
+    return None
+
+# ----------------------------------------------------------------------------
+
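+
+# Worked example for the Z-score step above (illustrative numbers, not taken from the data):
+# Effect = 0.12 and StdErr = 0.03 give Zscore = 0.12 / 0.03 = 4.0. Each locus file written by
+# printLocus then holds only the columns CHR, BP, Allele1, Allele2, Effect, StdErr, Pvalue and
+# Zscore (using the default header names), separated by spaces.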
+ parser.add_option("--pv-header", "--pvalue-header", dest="pvalue_header", default="Pvalue") #P-value header + parser.add_option("--st", "--stderr", dest="stderror", default="StdErr") #Standart Error header + parser.add_option("-e", "--effect", dest="effect", default="Effect") #Effect header + parser.add_option("--chr", "--chromosome", dest="chr", default="CHR") #Chromosome number header + parser.add_option("--a1", "--effect-allele", dest="a1", default="Allele1") #Effect allele header + parser.add_option("--a2", "--alt-allele", dest="a2", default="Allele2") #Alternative allele header + parser.add_option("--pos", "--position", dest="pos", default ="BP") #Position of SNP header + parser.add_option("-z", "--Zheader", dest="Zhead", default="Zscore") #Header name of Zscore new column, "Zscore" recommended for PAINTOR + parser.add_option("--kb", "--up-down-kb", dest="kb", default=500) #Number of kb upstream and downstream of the best SNP(best pvalue) for each locus + parser.add_option("--pv-threshold", "--pvalue-threshold", dest="pvalue_threshold", default=5e-08) #Value for the pvalue treshold + parser.add_option("-o", "--outname", dest="outname", default ="CHRnLocusm") #Locus output name format + parser.add_option("--od", "--outdir", dest="outdir", default ="data/output/locus_output") #Locus output directory + (options, args) = parser.parse_args() + + + data_bank = options.data_bank + sep = options.separator + pvalue_header = options.pvalue_header + std = options.stderror + effect = options.effect + chr = options.chr + allele1 = options.a1 + allele2 = options.a2 + pos = options.pos + zhead = options.Zhead + kb = options.kb + pvalue_threshold = options.pvalue_threshold + out = options.outname + outdir = options.outdir + + #TODO : check CAD_META readme for Header info + usage = """Usage: + -d (required) specify data bank path (default is "data" in the same directory as this programm) + --sp specifiy the separator character used in data bank file (default is a tab '\t') + --pv specifiy the P-value header used in data bank file (default is P-Value) + --st specifiy the Standart Error header used in data bank file (default is StdErr) + -e specifiy the Effect header used in data bank file (default is Effect) + --chr specifiy the Chromosome number header used in data bank file (default is CHR) + --a1 specifiy the Effect Allele header used in data bank file (default is Allele1) + --a2 specifiy the Alernative Allele used in data bank file (default is Allele2) + --pos specifiy the SNP Position header used in data bank file (default is BP) + -z specifiy the wanted Zscore header used in data bank file (default is "Zscore") + --kb specifiy the wanted number of kilo base upstream and downstream of the best SNP(best pvalue) for each locus + --pv-threshold specifiy the wanted pvalue threshold + --od specifiy the wanted output directory (default is the output directory in the data directory) + -o (WIP) (optional) specifiy output format name + """ + + if(data_bank == None): + sys.exit(usage) + + debut = time.time() + + # makes a lits of 22 dataframes, one for each chromosome, each including the snps of the chromosome in question + chromosomes_list = ChromosomeSplitter(data_bank, sep, chr) + + p=Pool(22) + p.map(lambda c : printLocus(LocusList(c, pvalue_header, pos, kb, pvalue_threshold), zhead, effect, std, outdir, pos, allele1, allele2, chr, pvalue_header),chromosomes_list) + + print("\n\n\n") + print("~~~~~ main finished in %s seconds ~~~~~\n" % (time.time() - debut)) + + return 0 +# 
+# ----------------------------------------------------------------------------
+
+
+if __name__ == "__main__":
+    main()