Commit 304efac1 authored by Blaise Li
Browse files

Summing across a repeat family in shared library.

parent 2693bebd
......@@ -29,7 +29,8 @@ import matplotlib.pyplot as plt
from libhts import make_empty_bigwig, median_ratio_to_pseudo_ref_size_factors, plot_histo
from libworkflows import get_chrom_sizes, cleanup_and_backup
from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded, read_feature_counts, sum_feature_counts
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family, read_feature_counts, sum_feature_counts
from smincludes import rules as irules
# Define functions to be used in shell portions
......@@ -625,9 +626,7 @@ rule gather_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -27,6 +27,7 @@ from libworkflows import (
feature_orientation2stranded,
get_chrom_sizes,
read_feature_counts,
sum_by_family,
sum_feature_counts,
wc_applied)
# http://sailfish.readthedocs.io/en/master/library_type.html
......@@ -298,9 +299,7 @@ rule gather_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
......
......@@ -66,6 +66,7 @@ from libworkflows import get_chrom_sizes, column_converter
from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_feature_counts, sum_feature_counts
from smincludes import rules as irules
......@@ -855,9 +856,7 @@ rule gather_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
......@@ -102,6 +102,7 @@ from libworkflows import wc_applied, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
from libworkflows import feature_orientation2stranded
from libworkflows import sum_by_family
from libworkflows import read_htseq_counts, sum_htseq_counts
from libworkflows import read_intersect_counts, sum_intersect_counts
from libworkflows import read_feature_counts, sum_feature_counts
......@@ -1067,9 +1068,7 @@ rule gather_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
......
......@@ -137,7 +137,7 @@ from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_dis
from libworkflows import texscape, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS
from libworkflows import read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context
from libworkflows import sum_by_family, read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context
from smincludes import rules as irules
strip = str.strip
......@@ -1184,9 +1184,7 @@ rule gather_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
......
......@@ -4,7 +4,7 @@ from .libworkflows import (
last_lines, make_id_list_getter, read_float_from_file, read_int_from_file,
read_feature_counts, read_htseq_counts, read_intersect_counts,
save_plot, strip_split,
sum_feature_counts, sum_htseq_counts, sum_intersect_counts,
sum_by_family, sum_feature_counts, sum_htseq_counts, sum_intersect_counts,
test_na_file,
texscape,
warn_context, wc_applied)
......@@ -274,6 +274,19 @@ def read_intersect_counts(counts_filename):
return pd.DataFrame(index = [], columns = ["gene", "counts"]).set_index("gene")
def sum_by_family(counts_data):
    """
    Sum the counts of all repeat instances belonging to the same family.

    The family of each row is derived from its index label, which is expected
    to be the repeat family name suffixed with a ":" and a number
    (representing the particular instance of the repeat), e.g.
    ``Simple_repeat|Simple_repeat|(TTTTTTG)n:4`` belongs to the family
    ``Simple_repeat|Simple_repeat|(TTTTTTG)n``.

    Only the last ":"-separated field is removed, so family names that
    themselves contain ":" are preserved. A label without any ":" is used
    unchanged as its own family, instead of being merged with every other
    unsuffixed label into a single family named "".

    :param counts_data: data frame whose index holds the repeat instance
        names (strings) and whose columns hold the counts to sum.
    :return: a new data frame indexed by "family", containing the
        column-wise sums over the rows of each family.
    """
    # maxsplit=1 from the right strips only the trailing instance number.
    repeat_families = [name.rsplit(":", 1)[0] for name in counts_data.index]
    return counts_data.assign(family=repeat_families).groupby("family").sum()
# http://stackoverflow.com/a/845069/1878788
def file_len(fname):
p = Popen(
......
......@@ -167,7 +167,7 @@ from libhts import plot_paired_scatters, plot_norm_correlations, plot_counts_dis
from libworkflows import texscape, wc_applied, ensure_relative, cleanup_and_backup
from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
from libworkflows import read_int_from_file, strip_split, file_len, last_lines, save_plot, SHELL_FUNCTIONS
from libworkflows import filter_combinator, read_feature_counts, sum_feature_counts, sum_htseq_counts, warn_context
from libworkflows import filter_combinator, read_feature_counts, sum_by_family, sum_feature_counts, sum_htseq_counts, warn_context
from smincludes import rules as irules
strip = str.strip
......@@ -1305,9 +1305,7 @@ rule gather_feature_counts:
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
# Pass the data frame itself (not its bound .assign method) to the shared helper.
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......@@ -1626,9 +1624,7 @@ rule gather_small_RNA_counts:
if wildcards.small_type in {"te_si", "te_siu",
"satel_si", "satel_siu",
"simrep_si", "simrep_siu"}:
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment