Commit 5fb40ecd authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

Merge branch 'jbrowse' into 'master'

Jbrowse

See merge request !1
parents 52d33f5a 2868d811
......@@ -26,6 +26,9 @@ out/
.gradle/
*.code-workspace
.vscode
**/.classpath
**/.project
#
organisms.xml
publications.xml
systemProp.imVersion=4.1.+
systemProp.bioVersion=4.1.+
systemProp.blueGenesVersion=0.9.9-SNAPSHOT
systemProp.blueGenesVersion=0.9.12
systemProp.javax.xml.stream.XMLOutputFactory=com.sun.xml.internal.stream.XMLOutputFactoryImpl
......@@ -11,17 +11,40 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="LegioList Legionella pneumophila str. Paris genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LegioParis"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/LegioParis_10-05-2012_LegioList/"/>
</source>
<source name="LegioParis-chromosome-fasta" type="fasta">
<property name="fasta.taxonId" value="297246"/>
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
<property name="fasta.dataSourceName" value="LegioList"/>
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila str. Paris chromosome sequence"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
<property name="fasta.includes" value="LegioParis_10-05-2012_LegioList.fasta"/>
</source>
<source name="LegioParis-ncbi-gff" type="legiolist-gff">
<source name="LegioParis-protein-gene" type="legiolist-gene-prot">
<property name="legiolist.taxid" value="297246"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/protein"/>
<property name="src.data.dir.includes" value="LegioParis_10-05-2012_LegioList-prot-gene.tsv"/>
</source>
<source name="LegioParis-protein-fasta" type="fasta">
<property name="fasta.taxonId" value="297246"/>
<property name="fasta.dataSetTitle" value="Legiolist protein sequence"/>
<property name="fasta.className" value="org.intermine.model.bio.Protein"/>
<property name="fasta.classAttribute" value="primaryIdentifier"/>
<property name="fasta.sequenceType" value="protein" />
<property name="fasta.sequenceType" value="protein" />
<property name="fasta.suffix" value="protein" />
<property name="fasta.includes" value="LegioParis_10-05-2012_LegioList.faa"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/protein"/>
</source>
<!-- <source name="LegioParis-ncbi-gff" type="legiolist-gff">
<property name="gff3.taxonId" value="297246"/>
<property name="gff3.seqDataSourceName" value="NCBI LegioParis"/>
<property name="gff3.dataSourceName" value="NCBI"/>
......@@ -38,7 +61,7 @@
<property name="fasta.includes" value="GCF_000048645.1_ASM4864v1_genomic.fna"/>
</source>
-->
<source name="Lfallonii-gff" type="legiolist-gff">
......@@ -47,7 +70,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella fallonii genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/Lfallonii"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/Lfallonii"/>
</source>
<source name="Lfallonii-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -55,10 +78,29 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella fallonii chromosome sequence"/>
<property name="fasta.taxonId" value="96230"/>
<property name="fasta.includes" value="Lfallonii.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
<source name="Lfallonii-protein-gene" type="legiolist-gene-prot">
<property name="legiolist.taxid" value="96230"/>
<property name="legiolist-gene-prot.dataSetTitle" value="LegioList Legionella fallonii proteins"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/protein"/>
<property name="src.data.dir.includes" value="Lfallonii-prot-gene.tsv"/>
</source>
<source name="Lfallonii-protein-fasta" type="fasta">
<property name="fasta.taxonId" value="96230"/>
<property name="fasta.dataSetTitle" value="Legiolist Lfallonii protein sequence"/>
<property name="fasta.className" value="org.intermine.model.bio.Protein"/>
<property name="fasta.classAttribute" value="primaryIdentifier"/>
<property name="fasta.sequenceType" value="protein" />
<property name="fasta.sequenceType" value="protein" />
<property name="fasta.sequenceType" value="protein" />
<property name="fasta.suffix" value="protein" />
<property name="fasta.includes" value="Lfallonii.faa"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/protein"/>
</source>
<source name="Lhackeliae-gff" type="legiolist-gff">
<property name="gff3.taxonId" value="449"/>
......@@ -66,7 +108,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella hackeliae genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/Lhackeliae"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/Lhackeliae"/>
</source>
<source name="Lhackeliae-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -74,7 +116,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella hackeliae chromosome sequence"/>
<property name="fasta.taxonId" value="449"/>
<property name="fasta.includes" value="Lhackeliae.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -85,7 +127,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella micdadei genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/Lmicadei"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/Lmicadei"/>
</source>
<source name="Lmicadei-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -93,7 +135,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella micdadei chromosome sequence"/>
<property name="fasta.taxonId" value="451"/>
<property name="fasta.includes" value="Lmicadei.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -104,7 +146,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella longbeachae NSW150 genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/Long_Mage"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/Long_Mage"/>
</source>
<source name="Long_Mage-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -112,7 +154,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella longbeachae NSW150 chromosome sequence"/>
<property name="fasta.taxonId" value="661367"/>
<property name="fasta.includes" value="Long_Mage.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -124,7 +166,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella pneumophila subsp. pneumophila str. Lorraine genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/Lorraine"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/Lorraine"/>
</source>
<source name="lorraine-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -132,7 +174,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila subsp. pneumophila str. Lorraine chromosome sequence"/>
<property name="fasta.taxonId" value="1046632"/>
<property name="fasta.includes" value="Lorraine.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -145,7 +187,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella pneumophila 2300/99 Alcoy genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpAlcoy"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/LpAlcoy"/>
</source>
<source name="LpAlcoy-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -153,7 +195,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila 2300/99 Alcoy chromosome sequence"/>
<property name="fasta.taxonId" value="423212"/>
<property name="fasta.includes" value="LpAlcoy.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -165,7 +207,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella pneumophila str. Corby genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpCorby"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/LpCorby"/>
</source>
<source name="LpCorby-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -173,7 +215,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila str. Corby chromosome sequence"/>
<property name="fasta.taxonId" value="400673"/>
<property name="fasta.includes" value="LpCorby.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -187,7 +229,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella pneumophila str. Lens genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpLens"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/LpLens"/>
</source>
<source name="LpLens-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -195,7 +237,7 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila str. Lens chromosome sequence"/>
<property name="fasta.taxonId" value="297245"/>
<property name="fasta.includes" value="LpLens.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
......@@ -206,7 +248,7 @@
<property name="gff3.dataSourceName" value="LegioList"/>
<property name="gff3.seqClsName" value="Chromosome"/>
<property name="gff3.dataSetTitle" value="Legionella pneumophila subsp. pneumophila str. Philadelphia 1 genome annotation"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpPhila"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/gff/clean/LpPhila"/>
</source>
<source name="LpPhila-chromosome-fasta" type="fasta">
<property name="fasta.className" value="org.intermine.model.bio.Chromosome"/>
......@@ -214,10 +256,10 @@
<property name="fasta.dataSetTitle" value="LegioList Legionella pneumophila subsp. pneumophila str. Philadelphia 1 chromosome sequence"/>
<property name="fasta.taxonId" value="272624"/>
<property name="fasta.includes" value="LpPhila.fasta"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/legiolist"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/genome/fasta"/>
</source>
<source name="uniprot" type="uniprot">
<!-- <source name="uniprot" type="uniprot">
<property name="uniprot.organisms" value="272624 297246 297245 400673 423212 1046632 661367 451 449 96230"/>
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/uniprot"/>
<property name="creatego" value="true"/>
......@@ -225,12 +267,11 @@
<property name="allowduplicates" value="false"/>
<property name="loadfragments" value="true"/>
<property name="loadtrembl" value="true"/>
</source>
</source> -->
<source name="legiolist-homology" type="legiolist-homology">
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/"/>
<property name="src.data.dir.includes" value="table_orthos.txt"/>
<!-- taxids of the legio strain for each column -->
<property name="legiolistTaxids" value="272624 297246 297245 400673 423212 1046632 661367 451 449 96230"/>
</source>
......@@ -244,12 +285,12 @@
</source>
<source name="go" type="go">
<property name="src.data.file" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/go/go-basic.obo"/>
<property name="src.data.file" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data/go-basic.obo"/>
</source>
<source name="pubmed-gene" type="pubmed-gene">
<property name="src.data.dir" location="/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/publication"/>
<property name="src.data.dir" location="/mnt/gaia/home/projects/legiolist/data/pre-process-data"/>
<property name="pubmed.organisms" value="272624 297246 297245 400673 423212 1046632 661367 451 449 96230"/>
<property name="src.data.dir.includes" value="gene2pubmed"/>
</source>
......@@ -267,6 +308,8 @@
<post-process name="create-chromosome-locations-and-lengths"/>
<post-process name="do-sources" />
<post-process name="populate-child-features"/>
<post-process name="create-location-range-index"/>
<post-process name="create-overlap-view" />
<post-process name="create-attribute-indexes" />
<post-process name="create-search-index" />
<post-process name="summarise-objectstore" />
......
#!/bin/bash
#
# from https://github.com/sergiocontrino/intermine/blob/rum/bio/scripts/rumen/autoget.sh
# default usage: automine.sh
#
# sc 02/17
#
# TODO:- timestamp downloads?
# - allow cs list of taxid?
# - rm empty files?
#
#
# check the host
#
# Global configuration for the download script. Everything below is plain
# variable setup; later code reads these as globals.
ABHOST="rumenmine-dev"
DATADIR=/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/uniprot/ # default datadir (on rumenmine-dev)
HOST=$(hostname -s)
#echo $HOST
# NOTE(review): the branch below assigns the same path as the default above,
# so the host check currently has no effect on DATADIR.
if [ "$HOST" != "$ABHOST" ]; then
DATADIR=/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/uniprot/
fi
#echo $DATADIR
# DATADIR already ends with '/', so SRCDIR expands to ".../uniprot//uniprot".
SRCDIR=$DATADIR/uniprot
# NOTE(review): LOGDIR, FTPURL, PROPDIR, SCRIPTDIR, ARKDIR, RECIPIENTS and
# BUILDDIR are not referenced anywhere else in the visible part of this script.
LOGDIR=$SRCDIR/logs
FTPURL=http://www.uniprot.org/uniprot
PROPDIR=$HOME/.intermine
SCRIPTDIR=./scripts
ARKDIR=/micklem/releases/modmine
RECIPIENTS=contrino@intermine.org
# set minedir (the directory the script is started from); no PATH check is
# actually performed here
MINEDIR=$PWD
BUILDDIR=$MINEDIR/integrate/build
# default settings: edit with care
V=nv # non-verbose mode; passed to wget as "-$V" (-v switches it to "v")
INFILE= # not using a given list of submissions (set with -f)
INTERACT=n # y: step by step interaction (set with -i)
WGET=y # use wget to get files from ftp (NOTE(review): never read in the visible script)
DB=a # no db specified (do them all); -s sets "s", -t sets "t"
S=uniprot # default source (override with -S)
progname=$0 # script name as invoked, shown in the usage text
function usage() {
    # Print the command-line help and terminate the script with status 0.
    # Called for -h and for any unrecognised option.
    cat <<EOF
Usage:
$progname [-S source] [-f file_name] [-i] [-v] [-s] [-t] [-h] taxId
    -f file_name: using a given list of submissions
    -i: interactive mode
    -v: verbose mode
    -s: only Swiss-Prot (reviewed entries)
    -t: only TrEMBL (unreviewed entries)
    -S: source [uniprot]
    -h: show this help
Parameters: you can process
    a single organism (taxId)
    a list of organisms (taxId) in an input file
examples:
    automine.sh 9913
    automine.sh -v -f infile
EOF
    exit 0
}
# Parse the command-line options into the global settings defined above.
# The leading ':' in the optstring selects getopts' silent mode: an unknown
# option yields '?' and a missing option argument yields ':' (with the
# offending option letter in OPTARG), both handled explicitly below.
while getopts ":hif:vS:st" opt; do
    case $opt in
    f) INFILE=$OPTARG ;;
    i)
        echo "- Interactive mode"
        INTERACT=y
        ;;
    v)
        echo "- Verbose mode"
        V=v
        ;;
    s)
        echo "- Only Swiss-Prot"
        DB=s
        ;;
    t)
        echo "- Only TrEMBL"
        DB=t
        ;;
    S)
        S=$OPTARG
        echo "- using source $S"
        ;;
    h) usage ;;
    :)
        # -f or -S given without its value: fail loudly instead of
        # silently carrying on with the default.
        echo "ERROR: option -$OPTARG requires an argument" >&2
        exit 1
        ;;
    \?) usage ;;
    esac
done
# Drop the parsed options so that $1 is the first positional argument (taxId).
shift $((OPTIND - 1))
# some input checking
# When a taxid list file was given with -f it must exist and be non-empty;
# its content is echoed back as a comma-separated list for confirmation.
if [ -n "$INFILE" ]; then
    if [ ! -s "$INFILE" ]; then
        echo "ERROR, $INFILE: no such file?"
        echo
        exit 1
    fi
    # Quoted so that paths containing spaces work and the list is printed
    # verbatim (no word splitting or glob expansion).
    SHOW="$(tr '\n' ',' <"$INFILE")"
    echo -n "- Using given list of taxids: "
    echo "$SHOW"
fi
echo "==================================="
echo "GETTING $S FILES "
echo "==================================="
# A positional argument, if present, is the single taxon id to process.
if [ -n "$1" ]; then
    SUB=$1
fi
function interact() {
    # Pause until the user presses return, but only in interactive mode (-i).
    #   $1: optional message printed before the prompt.
    # Returns immediately (status 0) when INTERACT is not "y".
    if [ "$INTERACT" != "y" ]; then
        return 0
    fi
    echo "$1"
    echo "Press return to continue (^C to exit).."
    echo -n "->"
    # -r: take the typed line literally (no backslash processing).
    read -r
}
function getFiles() {
    #---------------------------------------
    # getting the xml from the uniprot site
    #---------------------------------------
    # Downloads, with wget, the UniProt XML for every requested taxon into
    # $SRCDIR: reviewed entries go to <taxid>_uniprot_sprot.xml and
    # unreviewed ones to <taxid>_uniprot_trembl.xml, depending on $DB
    # (a: both, s: reviewed only, t: unreviewed only).
    # Reads the globals SUB, INFILE, SRCDIR, DB and V; sets LOOPVAR and sub.
    # Returns 1 when no taxon was given or $SRCDIR cannot be entered.
    if [ -n "$SUB" ]; then
        # doing only 1 sub
        LOOPVAR="$SUB"
    elif [ -n "$INFILE" ]; then
        # use the list provided in a file (read before changing directory so
        # a relative path still resolves)
        LOOPVAR=$(cat "$INFILE")
    else
        echo "ERROR: please enter input file location or desired taxon Id."
        return 1
    fi
    # Without this check a failed cd would make wget write into whatever
    # directory the script was started from.
    if ! cd "$SRCDIR"; then
        echo "ERROR: cannot change to directory $SRCDIR" >&2
        return 1
    fi
    # LOOPVAR is intentionally unquoted: it is a whitespace-separated list.
    for sub in $LOOPVAR; do
        echo "Processing taxon $sub.."
        if [ "$DB" = "a" ] || [ "$DB" = "s" ]; then
            wget -O "${sub}_uniprot_sprot.xml" "-$V" --progress=dot:giga "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:yes&fil=&format=xml"
        fi
        if [ "$DB" = "a" ] || [ "$DB" = "t" ]; then
            wget -O "${sub}_uniprot_trembl.xml" "-$V" --progress=dot:giga "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:no&fil=&format=xml"
        fi
        #
        # rm files if empty
        # (-f: with -s or -t one of the two files is never created, and a
        # plain rm would print an error for it)
        #
        if [ ! -s "${sub}_uniprot_sprot.xml" ]; then
            rm -f "${sub}_uniprot_sprot.xml"
        fi
        if [ ! -s "${sub}_uniprot_trembl.xml" ]; then
            rm -f "${sub}_uniprot_trembl.xml"
        fi
    done
}
interact
########################################
#
# MAIN
#
########################################
#---------------------------------------
# get the xml files
#---------------------------------------
#
# Dispatch on the selected source; uniprot is the only one implemented,
# anything else just prints a notice.
case "$S" in
uniprot)
    getFiles
    echo bye!
    ;;
*)
    echo "At the moment the program support only uniprot as a source, farewell.."
    echo
    ;;
esac
#!/usr/bin/env python3
from Bio import SeqIO
from pathlib import Path
import click
from itertools import chain
@click.command()
@click.option(
    "-d",
    "--directory",
    required=True,
    type=click.Path(exists=True, dir_okay=True, file_okay=True),
    help="Genome input source",
)
@click.option(
    "-o",
    "--output-dir",
    required=True,
    # The first positional parameter of click.Path is ``exists``; spell it
    # out instead of passing a file-mode string that is merely truthy.
    type=click.Path(exists=True),
    help="path to the output dir",
)
@click.option(
    "-f",
    "--output-format",
    required=True,
    type=str,
    help="output sequence format",
)
def convert_genomes(directory, output_dir, output_format):
    """Convert every genome found under DIRECTORY to OUTPUT_FORMAT.

    Each input file is parsed and rewritten as
    ``<output_dir>/<input stem>.<output_format>``. All three options are
    required so that a missing one is reported by click instead of
    surfacing later as a ``TypeError`` on a ``None`` value.
    """
    print(output_format)
    genomes = gen_path_with_format(directory)
    gen_genomes = gen_genomes_with_steam(genomes)
    for filename, genome in gen_genomes:
        print(filename)
        write_genbank(genome, filename, output_dir, output_format)
def write_genbank(genome, filename, directory, sequence_format):
    """Write ``genome`` to ``<directory>/<filename>.<sequence_format>``.

    Args:
        genome: records handed unchanged to ``SeqIO.write``.
        filename: output file name without extension.
        directory: directory the file is created in.
        sequence_format: format name given to ``SeqIO.write``; also used as
            the file extension.
    """
    target_dir = Path(directory)
    target = target_dir / (filename + "." + sequence_format)
    with open(target, "w") as sink:
        print("-Write file " + filename)
        SeqIO.write(genome, sink, sequence_format)
def gen_path_with_format(input_str):
    """Yield ``(format_name, path)`` pairs for the genome input ``input_str``.

    If ``input_str`` is a directory it is searched recursively: all
    ``*.embl`` files are yielded first (format ``"embl"``), then all
    ``*.genbank`` files (format ``"genbank"``). Otherwise the path itself is
    yielded once, as ``"embl"`` when its suffix is ``.embl`` and as
    ``"genbank"`` for any other suffix.
    """
    source = Path(input_str)
    if not source.is_dir():
        # Single file: anything that is not .embl is treated as genbank.
        fmt = "embl" if source.suffix == ".embl" else "genbank"
        yield (fmt, source)
        return
    embl_files = (("embl", found) for found in source.rglob("*.embl"))
    genbank_files = (("genbank", found) for found in source.rglob("*.genbank"))
    yield from chain(embl_files, genbank_files)
def gen_genomes_with_steam(genomes):
    """Yield ``(stem, records)`` for each ``(format_name, path)`` pair.

    ``stem`` is the file name without its last suffix, as a string, and
    ``records`` is whatever ``SeqIO.parse(path, format_name)`` returns.
    """
    for fmt, path in genomes:
        stem = str(Path(path.stem))
        records = SeqIO.parse(path, fmt)
        yield (stem, records)
# Run the click command only when this file is executed as a script; click
# reads the options from the command line.
if __name__ == "__main__":
    convert_genomes()
#!/usr/bin/env python3
import pprint
from BCBio.GFF import GFFExaminer
# Pretty-print what GFFExaminer.available_limits() reports for one GFF3 file.
# The input path is hard-coded.
in_file = "/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpPhila/LpPhila.genbank.gff3"
examiner = GFFExaminer()
# The context manager closes the file even if the examiner raises.
with open(in_file) as in_handle:
    pprint.pprint(examiner.available_limits(in_handle))
#!/bin/bash
# Runs embl-to-genbank.py on the genomes/genbank/ directory (-d) and writes the
# result, in fasta format (-f), into genomes/fasta/ (-o). All paths are
# absolute and specific to one workstation.
#
# Disabled one-off fix, kept for reference: renames the GFF3 feature type
# "RBS" to "Shine_Dalgarno_sequence" in place (leaving a .bak copy).
# perl -i.bak -pne 's/\tRBS\t/\tShine_Dalgarno_sequence\t/' /home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpLens.genbank.gff3
python /home/rplanel/projects/legiolist-intermine/legiolist/scripts/embl-to-genbank.py -d /home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/genbank/ -o /home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/fasta/ -f fasta
\ No newline at end of file
#!/usr/bin/env python3
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from pathlib import Path
import click
from itertools import chain
import re
@click.command()
@click.option(
    "-d",
    "--directory",
    required=True,
    type=click.Path(exists=True, dir_okay=True, file_okay=True),
    help="Genome input source",
)
@click.option(
    "-o",
    "--output-dir",
    required=True,
    # The first positional parameter of click.Path is ``exists``; spell it
    # out instead of passing a file-mode string that is merely truthy.
    type=click.Path(exists=True),
    help="path to the output dir",
)
@click.option(
    "-f",
    "--output-format",
    required=True,
    type=str,
    help="output sequence format",
)
@click.option(
    "-t",
    "--feature-type",
    required=True,
    type=str,
    help="type of the features to extract",
)
def split_genomes(directory, output_dir, output_format, feature_type):
    """Extract the features of one type from every genome under DIRECTORY.

    For each input file, the sequences of all features whose type equals
    FEATURE_TYPE are written to ``<output_dir>/<input stem>.<output_format>``.
    All options are required so that a missing one is reported by click
    instead of producing a ``TypeError`` or silently empty output files.
    """
    genomes = gen_path_with_format(directory)
    gen_genomes = gen_genomes_with_steam(genomes)
    for filename, genome in gen_genomes:
        write_sequence(genome, filename, output_dir, output_format, feature_type)
def write_sequence(genome, filename, directory, sequence_format, feature_type):
    """Write each feature of type ``feature_type`` as its own sequence record.

    The output file is ``<directory>/<filename>.<sequence_format>``. Every
    matching feature becomes one record whose id is the feature's first
    ``locus_tag`` qualifier and whose description is its first ``product``
    qualifier, unless that value consists only of whitespace.

    Args:
        genome: iterable of records exposing ``features`` and ``seq``.
        filename: output file name without extension.
        directory: directory the file is created in.
        sequence_format: format name given to ``SeqIO.write``; also used as
            the file extension.
        feature_type: only features whose ``type`` equals this are written.

    Raises:
        ValueError: if a matching feature has no ``locus_tag`` qualifier.
    """
    # Compiled once instead of once per feature.
    whitespace_only = re.compile(r"^\s+$")
    output_path = Path(directory) / (filename + "." + sequence_format)
    with open(output_path, "w") as output_handle:
        print("# Write file " + filename)
        for record in genome:
            for feat in record.features:
                if feat.type != feature_type:
                    continue
                locus_tags = feat.qualifiers.get("locus_tag")
                if not locus_tags:
                    # Raising a bare string is itself a TypeError in
                    # Python 3; raise a real exception, and do it before
                    # any sequence extraction work.
                    raise ValueError("No locus tag !!!")
                seq_record = SeqRecord(feat.extract(record.seq))
                seq_record.id = locus_tags[0]
                product = feat.qualifiers.get("product")
                if product and not whitespace_only.match(product[0]):
                    seq_record.description = product[0]
                SeqIO.write(seq_record, output_handle, sequence_format)
def gen_path_with_format(input_str):
input_path = Path(input_str)