Skip to content
Snippets Groups Projects
Commit 7b983ae5 authored by Blaise Li
Browse files

Find duplicates between more than one submission.

parent b5a44307
No related branches found
No related tags found
No related merge requests found
......@@ -15,19 +15,29 @@ if major < 3 or (major == 3 and minor < 6):
sys.exit("Need at least python 3.6\n")
from itertools import combinations
import pandas as pd
# Lift every pandas display limit so that printed indexes and tables are
# shown in full instead of being truncated or wrapped.
for _option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_option, None)
del _option  # do not leave the loop variable in the module namespace
def main():
    """Report index values shared between every pair of input tables.

    Each command-line argument after the script name is read as a
    headerless, tab-separated table whose third column (position 2) becomes
    the index; the printed messages refer to these index values as md5sums.
    For every unordered pair of files whose indexes intersect, this prints
    the number of shared values, the shared index itself, and the inner join
    of the two tables, with overlapping column names suffixed by the name of
    the file they came from.

    Any number of files is accepted. With fewer than two there is no pair to
    compare, so nothing is printed.

    Returns:
        0, always.
    """
    # Keyed by file name: a name given twice on the command line is loaded
    # once and is never compared with itself.
    tabs = {
        fname: pd.read_table(fname, sep="\t", header=None, index_col=2)
        for fname in sys.argv[1:]
    }
    for fname1, fname2 in combinations(tabs, 2):
        tab1 = tabs[fname1]
        tab2 = tabs[fname2]
        common_idx = tab1.index.intersection(tab2.index)
        # Stay silent about pairs that share nothing.
        if common_idx.empty:
            continue
        print(f"{len(common_idx)} common md5sums between {fname1} and {fname2}.")
        print(common_idx)
        # Both tables carry the same positional column labels, so the
        # suffixes are what tells the two sides of the join apart.
        common = tab1.join(
            tab2,
            how="inner",
            lsuffix=f"_{fname1}",
            rsuffix=f"_{fname2}",
        )
        print(common)
    return 0
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment