#!/usr/bin/env python3
# Copyright (C) 2020 Blaise Li
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Extracts the first N positions from the annotations provided in a bed file.

The result is written in bed format on the standard output.
No bounds checks are performed. Some resulting bed entries might exceed
chromosome boundaries.
"""

from argparse import (
    ArgumentParser,
    ArgumentDefaultsHelpFormatter)
import sys


# Number of leading bed columns this script reads and writes back.
_NB_FIELDS = 6
# First word of the non-data lines that may precede bed records.
_HEADER_KEYWORDS = ("track", "browser")


def _is_skippable(line):
    """Tell whether a stripped *line* holds no bed record.

    This is the case for empty lines, "#" comments and lines whose first
    whitespace-delimited word is "track" or "browser".
    """
    if not line or line.startswith("#"):
        return True
    return line.split(None, 1)[0] in _HEADER_KEYWORDS


def _fragment_bounds(start, end, strand, start_size):
    """Return the (start, end) of the first *start_size* positions.

    For annotations on the "-" strand, the fragment is anchored on *end*.
    For any other strand value, it is anchored on *start*.
    No bounds checks are performed.
    """
    if strand == "-":
        return (end - start_size, end)
    return (start, start + start_size)


def main(argv=None):
    """Run the command-line script.

    *argv* is the list of command-line arguments to parse (without the
    program name). When it is None, the arguments are taken from sys.argv.

    One bed line is printed on the standard output for each retained
    annotation. Only the first six columns of the input are used: extra
    columns are ignored. Empty lines, "#" comments and "track" or "browser"
    lines are skipped.

    Returns 0, to be used as exit status.

    Raises ValueError if a record has less than six tab-separated fields
    or if its start or end cannot be converted to an integer.
    """
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "-b", "--bedfile",
        required=True,
        help="Input bed file.")
    parser.add_argument(
        "-s", "--start_size",
        type=int,
        default=200,
        help="Number of positions to extract.")
    parser.add_argument(
        "-k", "--keep_short",
        help="Set this option to keep annotations that are too short.",
        action="store_true")
    args = parser.parse_args(argv)

    start_size = args.start_size
    keep_short = args.keep_short
    nb_too_short = 0
    with open(args.bedfile) as bedfile:
        for (line_num, raw_line) in enumerate(bedfile, start=1):
            line = raw_line.strip()
            if _is_skippable(line):
                continue
            fields = line.split("\t")
            if len(fields) < _NB_FIELDS:
                raise ValueError(
                    f"Line {line_num} of {args.bedfile} has {len(fields)} "
                    f"tab-separated fields, at least {_NB_FIELDS} expected.")
            # Columns beyond the sixth one are not needed here.
            (chrom, start, end, name, score, strand) = fields[:_NB_FIELDS]
            # Convert the coordinates once and for all.
            (start, end) = (int(start), int(end))
            if end - start < start_size:
                nb_too_short += 1
                if not keep_short:
                    continue
                sys.stderr.write(
                    "Extracted fragment will be longer "
                    f"than annotation size for {name}\n")
            (frag_start, frag_end) = _fragment_bounds(
                start, end, strand, start_size)
            print(
                chrom, frag_start, frag_end,
                name, score, strand, sep="\t")
    if nb_too_short:
        sys.stderr.write(
            f"{nb_too_short} annotations were shorter "
            "than the extracted fragment.\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())