Skip to content
Snippets Groups Projects
Select Git revision
  • b7c628aa1df65e577f9f33c5a001fc26f1322293
  • master default protected
2 results

wrapper.py

Blame
  • find_common_raws.py 1.22 KiB
    #!/usr/bin/env python3
    # vim: set fileencoding=<utf-8> :
    """
    
    This script tries to identify raw data files shared between submissions, based
    on saved md5sums. The md5sums are expected to be in tab-separated files named
    raw.tsv, with the file name in the first column and the md5 in the third.
    
    """
    import sys
    
    
    # Abort early when the interpreter is older than Python 3.6.
    # NOTE: this file also uses f-strings, so an older interpreter will most
    # likely reject it with a SyntaxError at compile time before reaching
    # this check.
    major, minor = sys.version_info[:2]
    if (major, minor) < (3, 6):
        sys.exit("Need at least python 3.6\n")
    
    
    from itertools import combinations
    import pandas as pd
    
    # Set every pandas display limit used when printing tables to None, so the
    # row, column and cell-width limits are lifted and the display width is
    # left to pandas (see the pandas options documentation for what None means
    # for each option).
    for _display_option in (
        "display.max_rows",
        "display.max_columns",
        "display.width",
        "display.max_colwidth",
    ):
        pd.set_option(_display_option, None)
    
    
    def main():
        """Report the md5sums shared by every pair of input tables.

        Each path given in ``sys.argv[1:]`` is read as a headerless,
        tab-separated table indexed by its third column (position 2). All
        tables are loaded first; then, for every pair of paths, the
        intersection of the two indexes is computed. When it is not empty,
        three things are printed: a summary line with the number of shared
        index values, the shared index itself, and the inner join of the two
        tables (overlapping column names get the source path as a suffix).

        Returns:
            0, always; the value is used as the process exit status.
        """
        # Load every table up front so a read failure surfaces before any
        # comparison output is printed.
        tables = {}
        for path in sys.argv[1:]:
            tables[path] = pd.read_table(path, sep="\t", header=None, index_col=2)

        for left_name, right_name in combinations(tables, 2):
            left = tables[left_name]
            right = tables[right_name]
            shared = left.index.intersection(right.index)
            if len(shared) == 0:
                # Nothing in common between this pair: stay silent.
                continue
            print(f"{len(shared)} common md5sums between {left_name} and {right_name}.")
            print(shared)
            joined = left.join(
                right,
                how="inner",
                lsuffix=f"_{left_name}",
                rsuffix=f"_{right_name}",
            )
            print(joined)
        return 0
    
    
    # Run the comparison only when this file is executed as a script, so that
    # importing the module does not read sys.argv and terminate the importing
    # process. The return value of main() becomes the exit status.
    if __name__ == "__main__":
        sys.exit(main())