...
 
Commits (2)
......@@ -29,7 +29,7 @@ if __name__ == '__main__':
# Script setup: apply the no-cache switch, create the attribute tallies and
# start reading the input table.
# NOTE(review): indentation is missing in this listing; the nesting described
# in these comments is inferred from the syntax and should be confirmed
# against the real file.
if args.no_cache:
# Sets the flag on cache_json; presumably this disables its on-disk cache.
cache_json.disk_cache_enabled = False
# Per-attribute tallies filled in further down the script.
bio_samples_attributes = {}
bio_samples_attributes_raw = {}
with open(args.input, 'r') as csv_input_file:
# The delimiter comes from the command line; '|' (not '"') is the quote character.
csv_reader = csv.reader(csv_input_file, delimiter=args.delimiter, quotechar='|')
# The first row holds the column names; next() raises StopIteration on an empty file.
header = next(csv_reader)
......@@ -37,11 +37,10 @@ if __name__ == '__main__':
# Locate the needed columns by name and build one StrainWrapped per data row.
# list.index raises ValueError when the header lacks the column.
# NOTE(review): strain_pos is used below but assigned outside this excerpt.
bio_sample_pos = header.index('BioSample')
assembly_pos = header.index('Assembly')
# Row budget taken from args.limit; it is consumed by the check that follows
# the try/except.
cpt = args.limit
strains = []
for row in tqdm(csv_reader):
try:
# NOTE(review): this listing looks like a rendered diff. The next three lines
# appear to be the earlier loop body (immediate counting), superseded by the
# strains.append(...) line below. Confirm which version is live before
# relying on either.
strain = StrainWrapped(row[strain_pos], row[bio_sample_pos], row[assembly_pos])
for attr_name, _ in strain.get_bio_sample_attributes():
bio_samples_attributes.setdefault(attr_name, [0])[0] += 1
# Keep the wrapped strain for the aggregation passes that run after the loop.
strains.append(StrainWrapped(row[strain_pos], row[bio_sample_pos], row[assembly_pos]))
# Best effort: a row that fails is printed with its traceback and the loop
# carries on; the bound exception `e` itself is not used.
except Exception as e:
print(row)
traceback.print_exc()
......@@ -49,12 +48,57 @@ if __name__ == '__main__':
# Row-limit check: leave the row loop once the budget in cpt is exhausted,
# otherwise consume one unit of it. A negative starting value never reaches
# zero, so it never triggers the break.
# NOTE(review): the test comes before the decrement. If this runs after the
# row has been handled, a starting budget of N lets N + 1 rows through.
# Confirm that this is intended.
if cpt == 0:
break
cpt -= 1
# Aggregation stage: summarise the BioSample attributes of every collected
# strain and dump the results into the current working directory.
#
#   bio_samples_attributes.csv       tab-separated: name, count, up to 5 sample values
#   bio_samples_attributes_raw.json  name -> [uses as attribute_name, uses as alias]
#   bio_samples_attributes.json      canonical name -> {"cpt": count, "values": [...]}
#   values.json                      value -> [occurrences], most frequent first
values = {}
# Placeholder texts that carry no information about the sample; they are
# tallied in ``values`` but never credited to an attribute name.
not_counted_values = {
    "missing",
    "Unknown",
    "not available",
}

# Pass 1: for every name an attribute goes by, count how often it appears as
# the primary attribute_name (slot 0) and as one of the aliases (slot 1).
for strain in tqdm(strains):
    for attr_name, aka, value in strain.get_bio_sample_attributes_and_value():
        values.setdefault(value, [0])[0] += 1
        informative = value not in not_counted_values
        for candidate in aka:
            # Register every alias, even those never credited, so that the
            # lookup in pass 2 cannot miss.
            alias_counts = bio_samples_attributes_raw.setdefault(candidate, [0, 0])
            # Names containing '-' or ' ' are never credited as aliases, which
            # keeps them from being chosen over a plain identifier in pass 2.
            if informative and "-" not in candidate and " " not in candidate:
                alias_counts[1] += 1
        name_counts = bio_samples_attributes_raw.setdefault(attr_name, [0, 0])
        if informative:
            name_counts[0] += 1

# Most frequent values first; the counts stay wrapped in one-element lists so
# the JSON layout is unchanged.
values = dict(sorted(values.items(), key=lambda item: item[1], reverse=True))

# Pass 2: fold every attribute onto its most used alias and collect the
# informative values seen under that canonical name.
translations = set()
bio_samples_attributes = {}
for strain in tqdm(strains):
    for attr_name, aka, value in strain.get_bio_sample_attributes_and_value():
        # Ties are broken on the name itself so the choice does not depend on
        # set iteration order.
        translated_attr_name = max(
            aka, key=lambda name: (bio_samples_attributes_raw[name][1], name))
        stats = bio_samples_attributes.setdefault(
            translated_attr_name, {"cpt": 0, "values": set()})
        if translated_attr_name != attr_name:
            translations.add((attr_name, translated_attr_name))
        if value in not_counted_values:
            continue
        stats["cpt"] += 1
        stats["values"].add(value)

# Report which attribute names were renamed, in a reproducible order.
print(len(translations))
for original_name, canonical_name in sorted(translations):
    print(original_name, "\t-->\t", canonical_name)

# Most used attributes first.
bio_samples_attributes = dict(
    sorted(bio_samples_attributes.items(), key=lambda item: item[1]["cpt"], reverse=True))

with open("bio_samples_attributes.csv", 'w', encoding="utf-8") as attributes_table:
    for name, stats in bio_samples_attributes.items():
        # Sets are not JSON serialisable and have no stable order: store a
        # sorted list instead. An element without text yields None, which
        # cannot be sorted or joined with strings, so it is kept last in the
        # list and left out of the preview columns.
        has_empty_text = None in stats["values"]
        texts = sorted(text for text in stats["values"] if text is not None)
        stats["values"] = (texts + [None]) if has_empty_text else texts
        attributes_table.write(
            "%s\t%i\t%s\n" % (name, stats["cpt"], "\t".join(texts[:5])))

with open("bio_samples_attributes_raw.json", 'w', encoding="utf-8") as output_json:
    output_json.write(json.dumps(bio_samples_attributes_raw, indent=4))
with open("bio_samples_attributes.json", 'w', encoding="utf-8") as output_json:
    output_json.write(json.dumps(bio_samples_attributes, indent=4))
with open("values.json", 'w', encoding="utf-8") as output_json:
    output_json.write(json.dumps(values, indent=4))
......@@ -20,9 +20,12 @@ class StrainWrapped:
def get_bio_sample_attributes(self):
    """Yield ``(attribute_name, aka)`` for every BioSample attribute element.

    Every descendant of ``self._bio_sample_sample_data`` that carries an
    ``attribute_name`` XML attribute is visited in document order.

    ``aka`` is the set of all XML attribute values found on that element,
    i.e. every name the attribute goes by. The ``attribute_name`` value
    itself is part of the set, exactly as in
    ``get_bio_sample_attributes_and_value``.
    """
    for attr in self._bio_sample_sample_data.iterfind(".//*[@attribute_name]"):
        aka = set(attr.attrib.values())
        yield attr.attrib['attribute_name'], aka
def get_bio_sample_attributes_and_value(self):
    """Yield ``(attribute_name, aka, value)`` for every BioSample attribute.

    Every descendant of ``self._bio_sample_sample_data`` that carries an
    ``attribute_name`` XML attribute is visited in document order.

    ``aka`` is the set of all XML attribute values found on the element
    (the ``attribute_name`` value included). ``value`` is the element's
    text, which is ``None`` when the element has no text.
    """
    for attr in self._bio_sample_sample_data.iterfind(".//*[@attribute_name]"):
        aka = set(attr.attrib.values())
        yield attr.attrib['attribute_name'], aka, attr.text
@property
def host(self):
......