...
 
Commits (6)
#!/usr/bin/env python3
import argparse
import csv
import json
import traceback
......@@ -8,27 +7,10 @@ import traceback
from tqdm import tqdm
from models import StrainWrapped
from utils import cache_json
from utils import build_argument_parser
if __name__ == '__main__':
    # Command-line interface of the attribute-fetching script.
    cli = argparse.ArgumentParser()
    cli.add_argument('-o', '--output', default='output.csv', help="where to write the output file")
    cli.add_argument('-i', '--input', default='genomes_proks815-genomesonnovember.sample.csv', help="file to read")
    # parser.add_argument('--biosample-attributes', dest='bio_sample_attributes', default='bio-sample-attributes.txt',
    #                     help="A text file where each line is an attribute to fetch when parsing a BioSample entry")
    cli.add_argument('--limit', default=-1, type=int,
                     help="max row to read in the input file, a value <= 0 means all rows.")
    cli.add_argument('-d', '--delimiter', default='\t', help="delimiter to use in both input and output file")
    cli.add_argument('-v', '--verbose', action='store_true')
    cli.add_argument('--no-cache', dest='no_cache', action='store_true', help="Neither use or save to file cache")
    args = cli.parse_args()
    if args.no_cache:
        # Disable the on-disk JSON cache for this run.
        cache_json.disk_cache_enabled = False
# Scan every strain's BioSample attributes, count per attribute how often a
# usable (non-placeholder) value is present, write several report files, and
# return the attributes filled for more than len(strains)*args.threshold
# strains, together with the strains themselves.
# NOTE(review): this span is a collapsed diff — the "......@@" hunk markers
# elide parts of the body, and several old/new line pairs appear side by side.
def fetch_attribute(args):
bio_samples_attributes_raw = {}
with open(args.input, 'r') as csv_input_file:
csv_reader = csv.reader(csv_input_file, delimiter=args.delimiter, quotechar='|')
......@@ -50,9 +32,14 @@ if __name__ == '__main__':
cpt -= 1
values = {}
# Placeholder values that must not count as "filled".
not_counted_values = {
"missing",
"Unknown",
"not available",
"MISSING",
"UNKNOWN",
"NA",
"N/A",
"NOT COLLECTED",
"NOT PROVIDED",
"NOT AVAILABLE",
"NOT APPLICABLE",
}
for strain in tqdm(strains):
for attr_name, aka, value in strain.get_bio_sample_attributes_and_value():
......@@ -63,7 +50,7 @@ if __name__ == '__main__':
continue
if " " in candidate:
continue
# NOTE(review): the next two lines are the old and new side of the same diff
# line — only the case-insensitive variant (value.upper()) should survive.
if value in not_counted_values:
if value.upper() in not_counted_values:
continue
# NOTE(review): cpt[1] += 1 appears before cpt is (re)bound on the next
# line — a diff artefact; the original order is presumably reversed.
cpt[1] += 1
cpt = bio_samples_attributes_raw.setdefault(attr_name, [0, 0])
......@@ -85,15 +72,15 @@ if __name__ == '__main__':
continue
stats["cpt"] += 1
stats["values"].add(value)
# NOTE(review): the three print lines below were commented out in the newer
# revision — the commented-out copies follow immediately.
print(len(translations))
for f, t in translations:
print(f, "\t-->\t", t)
# print(len(translations))
# for f, t in translations:
# print(f, "\t-->\t", t)
# Sort attributes by how often they were filled, most frequent first.
bio_samples_attributes = {k: v for k, v in
sorted(bio_samples_attributes.items(), key=lambda item: item[1]["cpt"], reverse=True)}
with open("bio_samples_attributes.csv", 'w') as bio_samples_attributes_txt:
for k, v in bio_samples_attributes.items():
# NOTE(review): old/new duplicate of the same assignment (diff artefact).
v["values"]=list(v["values"])
v["values"] = list(v["values"])
bio_samples_attributes_txt.write("%s\t%i\t%s\n" % (k, v["cpt"], "\t".join(v["values"][:5])))
with open("bio_samples_attributes_raw.json", 'w') as output_json:
......@@ -102,3 +89,11 @@ if __name__ == '__main__':
output_json.write(json.dumps(bio_samples_attributes, indent=4))
with open("values.json", 'w') as output_json:
output_json.write(json.dumps(values, indent=4))
# Keep only attributes present in more than the threshold fraction of strains.
min_cpt = len(strains) * args.threshold
return {k: v for k, v in bio_samples_attributes.items() if v["cpt"] > min_cpt}, strains
if __name__ == '__main__':
    # fetch_attribute() returns a (selected_attributes, strains) tuple; unpack
    # it so `selected_attributes` is not silently bound to the whole tuple.
    selected_attributes, strains = fetch_attribute(build_argument_parser())
    # print(selected_attributes)
#!/usr/bin/env python3
import argparse
import csv
import traceback
from tqdm import tqdm
from models import StrainWrapped
from utils import cache_json
from attributs_fetcher import fetch_attribute
from utils import build_argument_parser
# NOTE(review): this whole span is a collapsed diff of the main export script —
# old and new revisions of many lines appear side by side (see notes below).
if __name__ == '__main__':
# NOTE(review): the argparse block below (down to the disk_cache_enabled line)
# is the old inline CLI; it is superseded by build_argument_parser() further down.
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', default='output.csv', help="where to write the output file")
parser.add_argument('-i', '--input', default='genomes_proks815-genomesonnovember.sample.csv', help="file to read")
parser.add_argument(
'--limit',
default='-1',
type=int,
help="max row to read in the input file, a value <= 0 means all rows.",
)
parser.add_argument('-d', '--delimiter', default='\t', help="delimiter to use in both input and output file")
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('--no-cache', dest='no_cache', action='store_true', help="Neither use or save to file cache")
args = parser.parse_args()
if args.no_cache:
cache_json.disk_cache_enabled = False
# NOTE(review): old single-line header list, superseded by the multi-line list below.
output_header = ["Strain", "BioSample", "Assembly", "Date", "Location", "Source", "Origin", "Host",
"Sequencing", "Assembling", "Coverage"]
# New revision: CLI parsing and attribute selection are delegated to helpers.
args = build_argument_parser()
# NOTE(review): build_argument_parser() is called a second time on the next
# line, re-parsing sys.argv — presumably `args` should be passed instead.
selected_attributes, strains = fetch_attribute(build_argument_parser())
# Fixed columns of the output file; the selected BioSample attribute
# columns are appended after them.
output_header = [
"Strain", "BioSample", "Assembly",
"Date",
"Location",
"Source",
"Origin",
"Host",
"Sequencing",
"Assembling",
"Coverage",
]
selected_attributes_start_at = len(output_header)
for k, _ in selected_attributes.items():
output_header.append(k)
strain_output_pos = output_header.index('Strain')
bio_sample_output_pos = output_header.index('BioSample')
......@@ -38,33 +39,41 @@ if __name__ == '__main__':
sequencing_output_pos = output_header.index('Sequencing')
coverage_output_pos = output_header.index('Coverage')
assembling_output_pos = output_header.index('Assembling')
# NOTE(review): the old revision also read the input CSV; the new one (next
# line) only writes the output, iterating the `strains` from fetch_attribute.
with open(args.input, 'r') as csv_input_file, open(args.output, 'w') as csv_output_file:
csv_reader = csv.reader(csv_input_file, delimiter=args.delimiter, quotechar='|')
with open(args.output, 'w') as csv_output_file: # open(args.input, 'r') as csv_input_file,
# csv_reader = csv.reader(csv_input_file, delimiter=args.delimiter, quotechar='|')
csv_writer = csv.writer(csv_output_file, delimiter=args.delimiter)
# NOTE(review): stale in the new revision — csv_reader no longer exists here
# (it is commented out above), so this line would raise NameError.
header = next(csv_reader)
strain_pos = header.index('Strain')
bio_sample_pos = header.index('BioSample')
assembly_pos = header.index('Assembly')
# strain_pos = header.index('Strain')
# bio_sample_pos = header.index('BioSample')
# assembly_pos = header.index('Assembly')
csv_writer.writerow(output_header)
if args.verbose:
print(output_header)
# NOTE(review): the --limit row counter — apparently unused once the loop
# switched from csv rows to `strains`.
cpt = -1
if args.limit > 0:
cpt = args.limit
# NOTE(review): old loop over csv rows vs new loop over strains.
for row in tqdm(csv_reader):
for strain in tqdm(strains):
results = [""] * len(output_header)
strain = StrainWrapped(row[strain_pos], row[bio_sample_pos], row[assembly_pos])
# NOTE(review): old unguarded assignments, superseded by the try block below.
results[strain_output_pos] = strain.strain
results[bio_sample_output_pos] = strain.bio_sample
results[assembly_output_pos] = strain.assembly
results[host_output_pos] = strain.host
results[location_output_pos] = strain.location
results[date_output_pos] = strain.isolation_date
results[source_output_pos] = strain.source
results[origin_output_pos] = strain.origin
results[sequencing_output_pos] = strain.sequencing
results[coverage_output_pos] = strain.coverage
results[assembling_output_pos] = strain.assembling
# strain = StrainWrapped(row[strain_pos], row[bio_sample_pos], row[assembly_pos])
try:
results[strain_output_pos] = strain.strain
results[bio_sample_output_pos] = strain.bio_sample
results[assembly_output_pos] = strain.assembly
# Fill the selected BioSample attribute columns.
for relative_pos, biosample_attr in enumerate(output_header[selected_attributes_start_at:]):
results[relative_pos + selected_attributes_start_at] = strain.bio_sample_sample_data_attr(
biosample_attr)
# results[host_output_pos] = strain.host
# results[location_output_pos] = strain.location
# results[date_output_pos] = strain.isolation_date
# results[source_output_pos] = strain.source
# results[origin_output_pos] = strain.origin
results[sequencing_output_pos] = strain.sequencing
results[coverage_output_pos] = strain.coverage
results[assembling_output_pos] = strain.assembling
except Exception as e:
# Dump context before re-raising so the failing strain is identifiable.
print(strain)
print(results)
traceback.print_exc()
raise e
if args.verbose:
print(results)
csv_writer.writerow(results)
......
#!/usr/bin/env python3
from datetime import datetime
from xml.etree.ElementTree import fromstring, fromstringlist
from record_extractor import bio_sample_record, assembly_record, sra_records
......@@ -42,12 +42,6 @@ class StrainWrapped:
# NOTE(review): fragment — the enclosing method's def line is elided above;
# it returns the BioSample "collection_date" attribute.
# return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["Date"]
return self._bio_sample_sample_data_attr("collection_date")
@property
def source(self):
    """Sample source (e.g. urban vs rural) — not implemented yet."""
    # Candidate implementation kept for reference:
    # return bio_sample_record(self.bio_sample)["DocumentSummarySet"]["DocumentSummary"][0]["SourceSample"]
    return "TODO"
@property
def origin(self):
# Sampling origin — blood draw? oral swab? ... (rest of the body is elided below this view)
......@@ -55,7 +49,10 @@ class StrainWrapped:
@property
def coverage(self):
    """Sequencing coverage from the NCBI assembly record, or None when the
    record has no DocumentSummary entry for this assembly accession."""
    # NOTE: the collapsed diff left the old unguarded `return` before this
    # try block, making the guard unreachable — only the guarded version kept.
    try:
        return assembly_record(self.assembly)["DocumentSummarySet"]["DocumentSummary"][0]["Coverage"]
    except IndexError:
        return None
@property
def sequencing(self):
# NOTE(review): method body elided by the collapsed-diff hunk that follows.
......@@ -67,18 +64,77 @@ class StrainWrapped:
def assembling(self):
    """Assembly status text(s) (e.g. "Complete Genome") extracted from the
    <assembly-status> nodes of the assembly record's Meta XML, joined with
    ", "; None when the assembly record has no DocumentSummary entry."""
    # Example Meta content:
    # <assembly-level>5</assembly-level>
    # <assembly-status>Complete Genome</assembly-status>
    # NOTE: the collapsed diff left the old unguarded body before this try
    # block; only the guarded (new) version is kept.
    try:
        # Meta is an XML fragment — wrap it so it parses as one document.
        my_xml = fromstringlist([
            "<wrap>",
            assembly_record(self.assembly)["DocumentSummarySet"]["DocumentSummary"][0]["Meta"],
            "</wrap>",
        ])
        return ", ".join([node.text for node in my_xml.iterfind(".//assembly-status")])
    except IndexError:
        return None
def _bio_sample_sample_data_attr(self, attr_name):
    """Return the raw text of the BioSample attribute named attr_name, or None when absent."""
    node = self._bio_sample_sample_data.find('.//*[@attribute_name="%s"]' % attr_name)
    if node is None:
        return None
    return node.text
def bio_sample_sample_data_attr(self, attr_name):
    """Return a cleaned-up BioSample attribute value, or None when missing.

    Cleanups applied:
    - Placeholder values ("missing", "N/A", ...) are mapped to None
      (case-insensitive comparison).
    - "collection_date" values matching a known input format are normalised
      to ISO (YYYY-MM-DD or YYYY-MM); unrecognised dates are returned as-is.
    - host "Homo sapiens sapiens" is normalised to "Homo sapiens".
    - "lat_lon", when absent, is rebuilt from the separate latitude/longitude
      attributes as e.g. "12.5 N 3.2 E".
    """
    attr = self._bio_sample_sample_data_attr(attr_name)
    # Values meaning "no data". NOTE: the original also re-checked a subset of
    # this list inside the collection_date branch — dead code, removed.
    not_provided = {
        "MISSING",
        "UNKNOWN",
        "NA",
        "N/A",
        "NOT COLLECTED",
        "NOT PROVIDED",
        "NOT AVAILABLE",
        "NOT APPLICABLE",
    }
    if attr is not None and attr.upper() in not_provided:
        return None
    if "collection_date" == attr_name and attr is not None:
        # NOTE(review): %B/%b parsing is locale dependent — assumes an English locale.
        for in_frmt, out_frmt in [
            ('%d-%B-%Y', '%Y-%m-%d'),
            ('%B-%Y', '%Y-%m'),
            ('%d-%b-%Y', '%Y-%m-%d'),
            ('%b-%Y', '%Y-%m'),
        ]:
            try:
                parsed_date = datetime.strptime(attr, in_frmt)
                return parsed_date.strftime(out_frmt)
            except ValueError:
                pass
    elif "host" == attr_name and attr is not None:
        if attr == "Homo sapiens sapiens":
            attr = "Homo sapiens"
    if attr is not None:
        return attr
    if "lat_lon" == attr_name:
        # Fall back to the split latitude/longitude attributes.
        lat = self._bio_sample_sample_data_attr("geographic location (latitude)")
        lon = self._bio_sample_sample_data_attr("geographic location (longitude)")
        if lat is None or lon is None:
            return attr
        try:
            lat = float(lat)
            lon = float(lon)
            lat = str(lat) + (" S " if lat < 0 else " N ")
            lon = str(lon) + (" W" if lon < 0 else " E")
            return lat + lon
        except (ValueError, TypeError):
            pass
    return None
# NOTE(review): truncated — iterates the SRA records for this BioSample and
# parses each entry's XML; the rest of the body is beyond this view.
def _sra_attrs(self, attr_name):
for entry in sra_records(self.bio_sample):
my_xml = fromstringlist([
......
#!/usr/bin/env python3
import argparse
import csv
import json
import os
import traceback
from xml.etree.ElementTree import fromstring, fromstringlist
# Decorator adding memoization (in-memory, plus an optional on-disk JSON
# cache — see disk_cache_enabled below) to `function`.
# NOTE(review): most of the decorator body is elided by the hunk marker below.
def cache_json(function):
mem_cache = {}
......@@ -33,3 +31,29 @@ def cache_json(function):
# Module-level switch: when False, the decorator skips the file cache.
cache_json.disk_cache_enabled = True
def build_argument_parser(argv=None):
    """Parse the command-line arguments shared by the extraction scripts.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
              (backward compatible — existing callers pass nothing).

    Returns:
        argparse.Namespace with output, input, limit, threshold, delimiter,
        verbose and no_cache attributes.

    Side effect: disables the on-disk JSON cache when --no-cache is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', default='output.csv', help="where to write the output file")
    parser.add_argument('-i', '--input', default='genomes_proks815-genomesonnovember.sample.csv', help="file to read")
    parser.add_argument(
        '--limit',
        default=-1,  # was the string '-1' (argparse coerced it via type=int); a plain int is clearer
        type=int,
        help="max row to read in the input file, a value <= 0 means all rows.",
    )
    parser.add_argument(
        '--attribute-threshold',
        dest="threshold",
        default=0.01,  # was the string '0.01'; same value after type=float coercion
        type=float,
        help="an attribute is extracted only if it is filled in at least entry_count*attribute-threshold entries",
    )
    parser.add_argument('-d', '--delimiter', default='\t', help="delimiter to use in both input and output file")
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--no-cache', dest='no_cache', action='store_true', help="Neither use or save to file cache")
    args = parser.parse_args(argv)
    if args.no_cache:
        # cache_json is the decorator defined above in this module.
        cache_json.disk_cache_enabled = False
    return args