...
 
Commits (2)
genomes_proks815-genomesonnovember.csv
output.csv
.idea
.venv
\ No newline at end of file
#!/usr/bin/env python3
import argparse
import csv
import os
from Bio import Entrez
# Configure Bio.Entrez from the environment; a variable that is not set
# leaves the corresponding setting as None.
Entrez.email = os.environ.get("NCBI_EMAIL")
Entrez.api_key = os.environ.get("NCBI_API_KEY")
class StrainWrapped:
    """A strain with its identifiers and lazily fetched BioSample metadata.

    ``strain``, ``bio_sample`` and ``assembly`` are stored exactly as given.
    The BioSample XML (``sample_data``) and the ``host`` attribute derived
    from it are looked up through ``Entrez`` on first access only.
    """

    def __init__(self, strain, bio_sample, assembly):
        """Store the identifiers; nothing is fetched at construction time."""
        self.strain = strain
        self.bio_sample = bio_sample
        self.assembly = assembly
        self.__host = None
        self.__sample_data = None

    def _fetch_sample_xml(self):
        """Query the ``biosample`` database and return the raw SampleData XML.

        Searches for ``self.bio_sample``; when the search reports more than
        one hit it is repeated with ``latest[filter]`` appended.  The document
        summaries of the resulting ids are fetched and the ``SampleData``
        field of the first summary is returned.

        Raises:
            LookupError: if the search yields no record id.
        """
        with Entrez.esearch(db="biosample", retmax=2, term=self.bio_sample) as handle:
            record = Entrez.read(handle)
        if int(record["Count"]) > 1:
            # Several hits for the same term: narrow the search with the
            # "latest" filter.
            latest_term = "%s AND (latest[filter])" % self.bio_sample
            with Entrez.esearch(db="biosample", retmax=2, term=latest_term) as handle:
                record = Entrez.read(handle)

        id_list = record["IdList"]
        if not id_list:
            # Previously an empty id list was handed to efetch unchanged;
            # fail here with a message that names the offending accession.
            raise LookupError("no BioSample record found for %r" % (self.bio_sample,))

        with Entrez.efetch(db="biosample", id=id_list, retmode="xml", rettype="docsum") as handle:
            record = Entrez.read(handle)
        return record["DocumentSummarySet"]["DocumentSummary"][0]["SampleData"]

    @property
    def sample_data(self):
        """Return the BioSample ``SampleData`` parsed into an XML element.

        The element is fetched and parsed on the first access and stored on
        the instance, so later accesses (including those made by ``host``)
        do not repeat the lookup.
        """
        if self.__sample_data is None:
            from xml.etree.ElementTree import fromstring

            self.__sample_data = fromstring(self._fetch_sample_xml())
        return self.__sample_data

    @property
    def host(self):
        """Return the text of the ``host`` attribute, or ``None`` if absent."""
        if self.__host is not None:
            return self.__host
        # A missing host is not remembered, but re-evaluating it only repeats
        # an in-memory search because ``sample_data`` is cached.
        node = self.sample_data.find('Attributes/Attribute[@attribute_name="host"]')
        self.__host = node.text if node is not None else None
        return self.__host
if __name__ == '__main__':
    # Command-line entry point: read strains from a delimited input file, look
    # up each strain's host via StrainWrapped, print every result row and
    # write the same rows to the output file.
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', default='output.csv', help="where to write the output file")
    parser.add_argument('-i', '--input', default='genomes_proks815-genomesonnovember.sample.csv', help="file to read")
    parser.add_argument('-d', '--delimiter', default='\t', help="delimiter to use in both input and output file")
    # NOTE(review): -v is parsed but nothing below consults args.verbose.
    parser.add_argument('-v', dest='verbose', action='store_true')
    args = parser.parse_args()

    output_header = ["Strain", "BioSample", "Assembly", "Host"]
    strain_output_pos = output_header.index('Strain')
    bio_sample_output_pos = output_header.index('BioSample')
    assembly_output_pos = output_header.index('Assembly')
    host_output_pos = output_header.index('Host')

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation asks for both readers and writers.
    with open(args.input, 'r', newline='') as csv_input_file, \
            open(args.output, 'w', newline='') as csv_output_file:
        csv_reader = csv.reader(csv_input_file, delimiter=args.delimiter, quotechar='|')
        csv_writer = csv.writer(csv_output_file, delimiter=args.delimiter)

        header = next(csv_reader, None)
        if header is None:
            # An empty file has no header row to locate the columns in.
            raise SystemExit("%s: input file is empty" % args.input)
        strain_pos = header.index('Strain')
        bio_sample_pos = header.index('BioSample')
        assembly_pos = header.index('Assembly')
        print(header)

        # The writer used to be created but never used, leaving the output
        # file empty; emit the header and then one row per input row.
        csv_writer.writerow(output_header)
        for row in csv_reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to look up.
                continue
            results = [""] * len(output_header)
            strain = StrainWrapped(row[strain_pos], row[bio_sample_pos], row[assembly_pos])
            results[strain_output_pos] = strain.strain
            results[bio_sample_output_pos] = strain.bio_sample
            results[assembly_output_pos] = strain.assembly
            results[host_output_pos] = strain.host
            print(results)
            csv_writer.writerow(results)
biopython
\ No newline at end of file