Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
genolist
legiolist
Commits
f014a333
Commit
f014a333
authored
Feb 11, 2020
by
Remi PLANEL
Browse files
Add script to convert genome to gene multiple fasta file
parent
67a8dbe9
Changes
1
Show whitespace changes
Inline
Side-by-side
scripts/split-genome-to-gene.py
0 → 100755
View file @
f014a333
#!/usr/bin/env python3
from
Bio
import
SeqIO
from
Bio.SeqRecord
import
SeqRecord
from
pathlib
import
Path
import
click
from
itertools
import
chain
import
re
@
click
.
command
()
@
click
.
option
(
"-d"
,
"--directory"
,
type
=
click
.
Path
(
exists
=
True
,
dir_okay
=
True
,
file_okay
=
True
),
help
=
"Genome input source"
,
)
@
click
.
option
(
"-o"
,
"--output-dir"
,
type
=
click
.
Path
(
"wb"
),
help
=
"path to the output dir"
)
@
click
.
option
(
"-f"
,
"--output-format"
,
type
=
str
,
help
=
"output sequence format"
)
@
click
.
option
(
"-t"
,
"--feature-type"
,
type
=
str
,
help
=
""
)
def
split_genomes
(
directory
,
output_dir
,
output_format
,
feature_type
):
genomes
=
gen_path_with_format
(
directory
)
gen_genomes
=
gen_genomes_with_steam
(
genomes
)
for
filename
,
genome
in
gen_genomes
:
write_sequence
(
genome
,
filename
,
output_dir
,
output_format
,
feature_type
)
def
write_sequence
(
genome
,
filename
,
directory
,
sequence_format
,
feature_type
):
with
open
(
Path
(
directory
)
/
(
filename
+
"."
+
sequence_format
),
"w"
)
as
output_handle
:
print
(
"# Write file "
+
filename
)
for
record
in
genome
:
for
feat
in
record
.
features
:
if
feat
.
type
==
feature_type
:
feature_seq
=
feat
.
extract
(
record
.
seq
)
seq_record
=
SeqRecord
(
feature_seq
)
locus_tags
=
feat
.
qualifiers
.
get
(
"locus_tag"
)
product
=
feat
.
qualifiers
.
get
(
"product"
)
if
locus_tags
and
len
(
locus_tags
)
>
0
:
seq_record
.
id
=
locus_tags
[
0
]
if
product
:
p
=
re
.
compile
(
r
"^\s+$"
)
if
not
p
.
match
(
product
[
0
]):
seq_record
.
description
=
product
[
0
]
SeqIO
.
write
(
seq_record
,
output_handle
,
sequence_format
)
else
:
raise
(
"No locus tag !!!"
)
def
gen_path_with_format
(
input_str
):
input_path
=
Path
(
input_str
)
if
input_path
.
is_dir
():
for
genome_path
in
chain
(
map
(
lambda
genome
:
(
"embl"
,
genome
),
input_path
.
rglob
(
"*.embl"
)),
map
(
lambda
genome
:
(
"genbank"
,
genome
),
input_path
.
rglob
(
"*.genbank"
)),
):
yield
genome_path
else
:
suffix
=
input_path
.
suffix
if
suffix
==
".embl"
:
yield
(
"embl"
,
input_path
)
else
:
yield
(
"genbank"
,
input_path
)
def
gen_genomes_with_steam
(
genomes
):
for
genome_format
,
genome
in
genomes
:
yield
(
str
(
Path
(
genome
.
stem
)),
SeqIO
.
parse
(
genome
,
genome_format
))
if
__name__
==
"__main__"
:
split_genomes
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment