Commit cbc2127f authored by Remi  PLANEL's avatar Remi PLANEL
Browse files

move script to sources

parent 23f6a5a3
#!/bin/bash
#
# from https://github.com/sergiocontrino/intermine/blob/rum/bio/scripts/rumen/autoget.sh
# default usage: automine.sh
#
# sc 02/17
#
# TODO:- timestamp downloads?
# - allow cs list of taxid?
# - rm empty files?
#
#
# check the host
#
ABHOST="rumenmine-dev"
DATADIR=/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/uniprot/ # default datadir (on rumenmine-dev)
HOST=$(hostname -s)
#echo $HOST
if [ "$HOST" != "$ABHOST" ]; then
DATADIR=/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/uniprot/
fi
#echo $DATADIR
SRCDIR=$DATADIR/uniprot
LOGDIR=$SRCDIR/logs
FTPURL=http://www.uniprot.org/uniprot
PROPDIR=$HOME/.intermine
SCRIPTDIR=./scripts
ARKDIR=/micklem/releases/modmine
RECIPIENTS=contrino@intermine.org
# set minedir and check that modmine in path
MINEDIR=$PWD
BUILDDIR=$MINEDIR/integrate/build
# default settings: edit with care
V=nv # non-verbose mode
INFILE= # not using a given list of submissions
INTERACT=n # y: step by step interaction
WGET=y # use wget to get files from ftp
DB=a # no db specified (do them all)
S=uniprot # default source
progname=$0
function usage() {
cat <<EOF
Usage:
$progname [-S source ] [-f file_name] [-i] [-v] [-s] [-t] taxId
-f file_name: using a given list of submissions
-i: interactive mode
-v: verbode mode
-S: source [uniprot]
Parameters: you can process
a single organism (taxId) (e.g. automine.sh 9913 )
a list of organisms (taxId) in an input file (e.g. automine.sh -v -f infile )
examples:
EOF
exit 0
}
while getopts ":if:vS:st" opt; do
case $opt in
f) INFILE=$OPTARG ;;
i)
echo "- Interactive mode"
INTERACT=y
;;
v)
echo "- Verbose mode"
V=v
;;
s)
echo "- Only Swiss-Prot"
DB=s
;;
t)
echo "- Only TrEMBL"
DB=t
;;
S)
S=$OPTARG
echo "- using source $S"
;;
h) usage ;;
\?) usage ;;
esac
done
shift $(($OPTIND - 1))
# some input checking
if [ -n "$INFILE" ]; then
if [ ! -s "$INFILE" ]; then
echo "ERROR, $INFILE: no such file?"
echo
exit 1
fi
SHOW="$(cat $INFILE | tr '[\n]' '[,]')"
echo -n "- Using given list of taxids: "
echo $SHOW
fi
echo "==================================="
echo "GETTING $S FILES "
echo "==================================="
if [ -n "$1" ]; then
SUB=$1
#echo "Processing taxon $SUB.."
fi
function interact() {
# if testing, wait here before continuing
if [ $INTERACT = "y" ]; then
echo "$1"
echo "Press return to continue (^C to exit).."
echo -n "->"
read
fi
}
function getFiles() {
#---------------------------------------
# getting the xml from ftp site
#---------------------------------------
if [ -n "$SUB" ]; then
# doing only 1 sub
LOOPVAR="$SUB"
elif [ -n "$INFILE" ]; then
# use the list provided in a file
LOOPVAR=$(cat $INFILE)
else
echo "ERROR: please enter input file location or desired taxon Id."
fi
cd $SRCDIR
#interact "START WGET NOW"
for sub in $LOOPVAR; do
echo "Processing taxon $sub.."
if [ "$DB" = "a" -o "$DB" = "s" ]; then
#wget -O $sub\_uniprot_sprot.xml -$V --progress=dot:mega --no-use-server-timestamps "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:yes&fil=&format=xml"
wget -O $sub\_uniprot_sprot.xml -$V --progress=dot:giga "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:yes&fil=&format=xml"
fi
if [ "$DB" = "a" -o "$DB" = "t" ]; then
#wget -O $sub\_uniprot_trembl.xml -$V --progress=dot:mega --no-use-server-timestamps "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:no&fil=&format=xml"
wget -O $sub\_uniprot_trembl.xml -$V --progress=dot:giga "http://www.uniprot.org/uniprot/?compress=no&query=organism:$sub%20AND%20reviewed:no&fil=&format=xml"
fi
#
# rm files if empty
#
if [ ! -s $sub\_uniprot_sprot.xml ]; then
rm $sub\_uniprot_sprot.xml
# add log
fi
if [ ! -s $sub\_uniprot_trembl.xml ]; then
rm $sub\_uniprot_trembl.xml
# add log
fi
done
}
interact
########################################
#
# MAIN
#
########################################
#---------------------------------------
# get the xml files
#---------------------------------------
#
if [ "$S" = "uniprot" ]; then
getFiles
echo bye!
#interact
else
echo "At the moment the program support only uniprot as a source, farewell.."
echo
fi #if $WGET=y
#!/usr/bin/env python3
from Bio import SeqIO
from pathlib import Path
import click
from itertools import chain
@click.command()
@click.option(
"-d",
"--directory",
type=click.Path(exists=True, dir_okay=True, file_okay=True),
help="Genome input source",
)
@click.option(
"-o", "--output-dir", type=click.Path("wb"), help="path to the output dir"
)
@click.option("-f", "--output-format", type=str, help="output sequence format")
def convert_genomes(directory, output_dir, output_format):
print(output_format)
genomes = gen_path_with_format(directory)
gen_genomes = gen_genomes_with_steam(genomes)
for filename, genome in gen_genomes:
print(filename)
write_genbank(genome, filename, output_dir, output_format)
def write_genbank(genome, filename, directory, sequence_format):
with open(
Path(directory) / (filename + "." + sequence_format), "w"
) as output_handle:
print("-Write file " + filename)
SeqIO.write(genome, output_handle, sequence_format)
def gen_path_with_format(input_str):
input_path = Path(input_str)
if input_path.is_dir():
for genome_path in chain(
map(lambda genome: ("embl", genome), input_path.rglob("*.embl")),
map(lambda genome: ("genbank", genome), input_path.rglob("*.genbank")),
):
yield genome_path
else:
suffix = input_path.suffix
if suffix == ".embl":
yield ("embl", input_path)
else:
yield ("genbank", input_path)
def gen_genomes_with_steam(genomes):
for genome_format, genome in genomes:
yield (str(Path(genome.stem)), SeqIO.parse(genome, genome_format))
if __name__ == "__main__":
convert_genomes()
#!/usr/bin/env python3
import pprint
from BCBio.GFF import GFFExaminer
in_file = "/home/rplanel/projects/legiolist-intermine/data/legiolist/genomes/gff/legiolist-gff/LpPhila/LpPhila.genbank.gff3"
examiner = GFFExaminer()
in_handle = open(in_file)
pprint.pprint(examiner.available_limits(in_handle))
in_handle.close()
#!/usr/bin/env python3
import re
from pathlib import Path
import click
class Error(Exception):
"""Base class for other exceptions"""
pass
class NoFeatureId(Error):
"""Raised when no feature id is found"""
pass
class NoLocusTag(Error):
"""Raised when no feature id is found"""
pass
class NoGeneFeature(Error):
"""Raised when no feature id is found"""
pass
class NoMRNAFeature(Error):
"""Raised when no feature id is found"""
pass
class NoCDSFeature(Error):
"""Raised when no feature id is found"""
pass
@click.command()
@click.option(
"-g",
"--gff",
type=click.Path(exists=True, dir_okay=True, file_okay=True),
help="GFF genome",
)
@click.option(
"-o", "--output-dir", type=click.Path("wb"), help="path to the output dir"
)
def link_features(gff, output_dir):
input_path = Path(gff)
feature_index = index_features(input_path)
write_gff(feature_index, input_path, output_dir)
# add_feature_parent(input_path, feature_index, output_dir)
def write_gff(feature_index, input_path, output_dir):
filename = input_path.stem
ext = input_path.suffix
output_path = Path(output_dir) / (filename + "-with-parent" + ext)
with open(output_path, "w") as output_handle:
for feat_group_id, feat_group in feature_index.items():
# check if ncRNA
if feat_group_id.startswith("lppnc"):
write_ncrna(feat_group_id, feat_group, output_handle)
elif "gene" in feat_group and "mRNA" in feat_group:
write_gene(feat_group_id, feat_group, output_handle)
else:
for feat_type, lines in feat_group.items():
if feat_type == "gene":
feat_id = feat_group_id
output_handle.write(
clean_entry_no_parent(lines[0], feat_id, feat_group_id)
+ "\n"
)
else:
for i, line in enumerate(lines):
feat_id = feat_group_id + "." + feat_type + "." + str(i + 1)
output_handle.write(
clean_entry_no_parent(line, feat_id, feat_group_id)
+ "\n"
)
print("DONE")
def write_ncrna(feature_group_id, feature_group, output_handle):
nc_rna_id = None
if "CDS" in feature_group:
feat_type = "NcRNA"
cds_line = feature_group["CDS"][0]
columns = cds_line.split("\t")
columns[2] = feat_type
nc_rna_id = feature_group_id + "." + feat_type
entry = clean_entry("\t".join(columns), None, nc_rna_id, feature_group_id,)
output_handle.write(entry + "\n")
else:
raise Exception(f"No CDS for {feature_group_id}")
if "promoter" in feature_group:
for i, line in enumerate(feature_group["promoter"]):
promoter_id = feature_group_id + ".promoter." + str(i + 1)
entry = clean_entry(line, nc_rna_id, promoter_id, feature_group_id)
output_handle.write(entry + "\n")
if "TSS" in feature_group:
for i, line in enumerate(feature_group["TSS"]):
tss_id = feature_group_id + ".TSS." + str(i + 1)
entry = clean_entry(line, nc_rna_id, tss_id, feature_group_id)
output_handle.write(entry + "\n")
def write_gene(feat_group_id, feat_group, output_handle):
gene_id = None
mrna_id = None
if "gene" in feat_group:
gene = feat_group["gene"][0]
gene_id = feat_group_id
entry = clean_entry(gene, None, gene_id, feat_group_id)
output_handle.write(entry + "\n")
else:
raise Exception(f"missing gene form {feat_group_id}")
if "mRNA" in feat_group:
line = feat_group["mRNA"][0]
mrna_id = feat_group_id + ".mRNA"
if gene_id:
entry = clean_entry(line, gene_id, mrna_id, feat_group_id)
output_handle.write(entry + "\n")
else:
raise Exception("no mRNA")
else:
print(f"missing mRNA form {feat_group_id}")
raise NoMRNAFeature(feat_group_id)
if "CDS" in feat_group:
line = feat_group["CDS"][0]
cds_id = feat_group_id + ".CDS"
entry = clean_entry(line, mrna_id, cds_id, feat_group_id)
output_handle.write(entry + "\n")
else:
raise NoCDSFeature(feat_group_id)
if "promoter" in feat_group:
for i, line in enumerate(feat_group["promoter"]):
if i == 0:
trna_id = feat_group_id + ".promoter"
else:
trna_id = feat_group_id + ".promoter." + str(i + 1)
entry = clean_entry(line, mrna_id, trna_id, feat_group_id)
output_handle.write(entry + "\n")
if "TSS" in feat_group:
for i, line in enumerate(feat_group["TSS"]):
if i == 0:
tss_id = feat_group_id + ".TSS"
else:
tss_id = feat_group_id + ".TSS." + str(i + 1)
entry = clean_entry(line, mrna_id, tss_id, feat_group_id)
output_handle.write(entry + "\n")
def clean_entry_no_parent(line, feat_id, locus_tag):
clean_line = []
columns = line.split("\t")
features_str = columns[-1]
features = features_str.split(";")
for feat in features:
if feat.startswith("ID=") or feat.startswith("locus_tag="):
continue
else:
clean_line.append(feat)
clean_line.append("ID=" + feat_id)
clean_line.append("locus_tag=" + locus_tag)
columns[-1] = ";".join(clean_line)
return "\t".join(columns)
def clean_entry(line, parent_id, feat_id, locus_tag):
clean_line = []
columns = line.split("\t")
features_str = columns[-1]
features = features_str.split(";")
for feat in features:
if (
feat.startswith("Parent=")
or feat.startswith("ID=")
or feat.startswith("locus_tag=")
):
continue
else:
clean_line.append(feat)
clean_line.append("ID=" + feat_id)
clean_line.append("locus_tag=" + locus_tag)
if parent_id:
clean_line.append("Parent=" + parent_id)
columns[-1] = ";".join(clean_line)
return "\t".join(columns)
def add_feature_parent(input_path, feature_index, output_dir):
filename = input_path.stem
ext = input_path.suffix
with open(
Path(output_dir) / (filename + "-with-parent" + ext), "w"
) as output_handle:
with input_path.open() as input_gff:
for line in input_gff:
line = line.strip()
if line.startswith(">"):
break
if line.startswith("#"):
output_handle.write(line + "\n")
else:
columns = line.split("\t")
feature_name = columns[2]
# link TSS
if feature_name == "TSS":
# print(line)
feature_str = columns[-1]
try:
feat_id = get_feature_id(feature_str)
trim_feat_id = get_trimmed_id(feat_id)
features = feature_index[trim_feat_id]
# print(trim_feat_id)
# print(features)
if "mRNA" in features:
output_handle.write(
line
+ ";Parent="
+ feature_index[trim_feat_id]["mRNA"]["id"]
+ "\n"
)
else:
output_handle.write(line + "\n")
except NoFeatureId:
# try to get parent id
parent_id = get_parent_id(feature_str)
trim_parent_id = get_trimmed_id(parent_id)
parent_features = feature_index[trim_parent_id]
if "TSS" in parent_features:
output_handle.write(
line
+ ";ID="
+ parent_features["TSS"]["id"]
+ ".1"
+ "\n"
)
else:
output_handle.write(
line + ";ID=" + trim_parent_id + ".TSS" + "\n"
)
# print(trim_id)
# gene_id = feature_index[trim_id]["gene"]
# output_handle.write(line + ";Parent=" + gene_id + "\n")
# print("tot")
else:
output_handle.write(line + "\n")
# feature_column =
def get_parent_id(features_str):
features = features_str.split(";")
for feat in features:
if feat.startswith("Parent="):
return feat.split("=")[1]
# return None
raise NoFeatureId(f"no parent for {features_str}")
def index_features(input_path):
feature_index = dict()
def fill_index(feat_group_id, feature_name, line):
if feat_group_id:
if feat_group_id in feature_index:
feat_group = feature_index[feat_group_id]
if feature_name in feat_group:
features = feat_group[feature_name]
features.append(line)
else:
feat_group[feature_name] = [line]
# feat_group.update(
# {feature_name: {feat_id: {"id": feat_id, "line": line}}}
# )
else:
feature_index.update({feat_group_id: {feature_name: [line]}})
with input_path.open() as input_gff:
for line in input_gff:
line = line.strip()
if line.startswith(">"):
break
if not line.startswith("#"):
columns = line.split("\t")
if len(columns) == 9:
feature_name = columns[2]
features_str = columns[-1]
feat_group_id = None
try:
feat_group_id = get_locus_tag(features_str)
fill_index(feat_group_id, feature_name, line)
except NoLocusTag:
try:
feat_id = get_feature_id(features_str)
feat_group_id = get_trimmed_id(feat_id)
fill_index(feat_group_id, feature_name, line)
continue
except NoFeatureId:
parent_id = get_parent_id(features_str)
feat_group_id = get_trimmed_id(parent_id)
fill_index(feat_group_id, feature_name, line)
continue
else:
raise Exception(
f"Column lenght not expected. should be 9. Got {len(columns)} column for line : {line}"
)
# print(feature_index)
return feature_index
def get_locus_tag(features_str):
features = features_str.split(";")
for feat in features:
if feat.startswith("locus_tag="):
return feat.split("=")[1]
# return None
raise NoLocusTag(f"no locus tag for: {features_str}")
def get_feature_id(features_str):
features = features_str.split(";")