Select Git revision
find_common_raws.py 1.22 KiB
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
"""
This script tries to identify raw data files shared between submissions, based
on saved md5sums. The md5sums are expected to be in tab-separated files named
raw.tsv, with the file name in the first column and the md5 in the third.
"""
import sys
# Refuse to run on interpreters older than Python 3.6.
# NOTE(review): the f-strings used further down in this file are a syntax
# error before 3.6, so on such interpreters compilation fails before this
# check gets a chance to run.
major = sys.version_info.major
minor = sys.version_info.minor
if (major, minor) < (3, 6):
    sys.exit("Need at least python 3.6\n")
from itertools import combinations
import pandas as pd
# Lift every pandas display limit so printed tables are never truncated
# (no row/column elision, no line wrapping limit, no cell-width cut-off).
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None
def main():
tabs = {
fname: pd.read_table(fname, sep="\t", header=None, index_col=2)
for fname in sys.argv[1:]
}
for (fname1, fname2) in combinations(tabs.keys(), 2):
tab1 = tabs[fname1]
tab2 = tabs[fname2]
common_idx = tab1.index.intersection(tab2.index)
if len(common_idx):
print(f"{len(common_idx)} common md5sums between {fname1} and {fname2}.")
print(common_idx)
common = tab1.join(tab2, how="inner", lsuffix=f"_{fname1}", rsuffix=f"_{fname2}")
print(common)
return 0
# Run the comparison only when executed as a script, so that importing this
# module (e.g. from a test or another tool) does not read sys.argv, print
# results and terminate the importing interpreter.
if __name__ == "__main__":
    sys.exit(main())