Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Blaise LI
libhts
Commits
f5991cb0
Commit
f5991cb0
authored
Oct 27, 2017
by
Blaise Li
Browse files
Functions to compute union exon lengths.
parent
88cea0dd
Changes
2
Hide whitespace changes
Inline
Side-by-side
libhts/__init__.py
View file @
f5991cb0
from
.libhts
import
do_deseq2
,
median_ratio_to_pseudo_ref_size_factors
,
plot_boxplots
,
plot_counts_distribution
,
plot_lfc_distribution
,
plot_MA
,
plot_norm_correlations
,
size_factor_correlations
,
status_setter
from
.libhts
import
do_deseq2
,
gtf_2_genes_exon_lengths
,
median_ratio_to_pseudo_ref_size_factors
,
plot_boxplots
,
plot_counts_distribution
,
plot_lfc_distribution
,
plot_MA
,
plot_norm_correlations
,
repeat_bed_2_lengths
,
size_factor_correlations
,
status_setter
libhts/libhts.py
View file @
f5991cb0
from
functools
import
reduce
import
warnings
...
...
@@ -20,6 +21,127 @@ as_df = r("as.data.frame")
from
rpy2.rinterface
import
RRuntimeError
from
rpy2.robjects.packages
import
importr
deseq2
=
importr
(
"DESeq2"
)
from
pybedtools
import
BedTool
import
networkx
as
nx
class
Exon
(
object
):
__slots__
=
(
"chrom"
,
"start"
,
"end"
)
def
__init__
(
self
,
chrom
,
start
,
end
):
self
.
chrom
=
chrom
self
.
start
=
start
self
.
end
=
end
def
overlap
(
self
,
other
):
if
self
.
chrom
!=
other
.
chrom
:
return
False
return
(
self
.
start
<=
other
.
start
<
self
.
end
)
or
(
other
.
start
<=
self
.
start
<
other
.
end
)
def
merge
(
self
,
other
):
# Not necessary: can be indirectly linked
#assert overlap(self, other)
return
Exon
(
self
.
chrom
,
min
(
self
.
start
,
other
.
start
),
max
(
self
.
end
,
other
.
end
))
def
__len__
(
self
):
return
self
.
end
-
self
.
start
overlap
=
Exon
.
overlap
merge
=
Exon
.
merge
class
Gene
(
object
):
"""This object contains information obtained from a gtf file."""
__slots__
=
(
"gene_id"
,
"exons"
,
"union_exon_length"
)
def
__init__
(
self
,
gene_id
):
self
.
gene_id
=
gene_id
#self.transcripts = {}
self
.
exons
=
nx
.
Graph
()
self
.
union_exon_length
=
None
#def add_transcript(self, feature):
# the_id = feature.attrs["transcript_id"]
# assert the_id not in self.transcripts
# self.transcripts[the_id] = feature
def
add_exon
(
self
,
feature
):
#the_id = feature.attrs["exon_id"]
#assert the_id not in self.exons
#self.exons[the_id] = feature
exon
=
Exon
(
feature
.
chrom
,
feature
.
start
,
feature
.
end
)
if
exon
not
in
self
.
exons
:
self
.
exons
.
add_node
(
exon
)
# The merging cannot be done on the full BedTool because we dont want
# to merge together exons not belonging to the same gene.
def
set_union_exon_length
(
self
):
"""The exons are used to make a BedTool, which enables convenient merging of
overlapping features. The sum of the lengths of the merged exons is returned."""
if
len
(
self
.
exons
)
==
1
:
# No need to merge when there is only one exon
self
.
union_exon_length
=
len
(
next
(
iter
(
self
.
exons
.
nodes
())))
else
:
# Too slow
#self.union_exon_length = sum(map(
# len, BedTool(self.exons.values()).merge().features()))
#self.union_exon_length = 0
# We group nodes that overlap, and merge them
#overlapping_exons = nx.quotient_graph(self.exons, overlap)
#for node in overlapping_exons.nodes():
# mex = reduce(merge, node)
# self.union_exon_length += len(mex)
self
.
union_exon_length
=
sum
((
len
(
reduce
(
merge
,
node
))
for
node
in
nx
.
quotient_graph
(
self
.
exons
,
overlap
).
nodes
()))
def
gtf_2_genes_exon_lengths
(
gtf_filename
):
"""Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
gtf_file
=
open
(
gtf_filename
,
"r"
)
gtf
=
BedTool
(
gtf_file
)
genes
=
{}
for
feature
in
gtf
.
features
():
feat_type
=
feature
[
2
]
if
feat_type
!=
"exon"
:
continue
attrs
=
feature
.
attrs
gene_id
=
attrs
[
"gene_id"
]
if
gene_id
not
in
genes
:
genes
[
gene_id
]
=
Gene
(
gene_id
)
gene
=
genes
[
gene_id
]
try
:
gene
.
add_exon
(
feature
)
except
AssertionError
:
# A given exon may be registered for several transcripts, hence several gtf entries
already
=
gene
.
exons
[
feature
.
attrs
[
"exon_id"
]]
assert
already
.
attrs
[
"transcript_id"
]
!=
feature
.
attrs
[
"transcript_id"
]
assert
(
already
.
start
,
already
.
end
)
==
(
feature
.
start
,
feature
.
end
)
for
gene
in
genes
.
values
():
gene
.
set_union_exon_length
()
return
pd
.
DataFrame
(
pd
.
Series
(
{
gene
.
gene_id
:
gene
.
union_exon_length
for
gene
in
genes
.
values
()},
name
=
(
"union_exon_len"
,)).
rename_axis
(
"gene"
))
def
repeat_bed_2_lengths
(
repeat_bed
):
"""Computes the lengths of repeatitive elements in a bed file, grouped by families.
This assumes that the elements have their names composed of the family name,
then a colon, then a number. For instance:
Simple_repeat|Simple_repeat|(TTTTTTG)n:1
Simple_repeat|Simple_repeat|(TTTTTTG)n:2
Simple_repeat|Simple_repeat|(TTTTTTG)n:3
Simple_repeat|Simple_repeat|(TTTTTTG)n:4
-> Simple_repeat|Simple_repeat|(TTTTTTG)n
Returns a DataFrame associating the summed lengths to the family names.
"""
# usecols=[1, 2, 3]: start, end, id
# index_col=2: id (relative to the selected columns)
start_ends
=
pd
.
read_table
(
repeat_bed
,
usecols
=
[
1
,
2
,
3
],
header
=
None
,
index_col
=
2
)
# bed lengths
lens
=
start_ends
[
2
]
-
start_ends
[
1
]
lens
.
name
=
"union_exon_len"
repeat_families
=
[
":"
.
join
(
name
.
split
(
":"
)[:
-
1
])
for
name
in
start_ends
.
index
]
# The reads assigned to a repeated element can come
# from the summed length of all the members of the family
# We call this "gene" for convenience and compatibility
return
pd
.
DataFrame
(
lens
).
assign
(
gene
=
repeat_families
).
groupby
(
"gene"
).
sum
()
def
do_deseq2
(
cond_names
,
conditions
,
counts_data
,
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment