Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libhts
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
libhts
Commits
f5991cb0
Commit
f5991cb0
authored
7 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Functions to compute union exon lengths.
parent
88cea0dd
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libhts/__init__.py
+1
-1
1 addition, 1 deletion
libhts/__init__.py
libhts/libhts.py
+122
-0
122 additions, 0 deletions
libhts/libhts.py
with
123 additions
and
1 deletion
libhts/__init__.py
+
1
−
1
View file @
f5991cb0
from
.libhts
import
do_deseq2
,
median_ratio_to_pseudo_ref_size_factors
,
plot_boxplots
,
plot_counts_distribution
,
plot_lfc_distribution
,
plot_MA
,
plot_norm_correlations
,
size_factor_correlations
,
status_setter
from
.libhts
import
do_deseq2
,
gtf_2_genes_exon_lengths
,
median_ratio_to_pseudo_ref_size_factors
,
plot_boxplots
,
plot_counts_distribution
,
plot_lfc_distribution
,
plot_MA
,
plot_norm_correlations
,
repeat_bed_2_lengths
,
size_factor_correlations
,
status_setter
This diff is collapsed.
Click to expand it.
libhts/libhts.py
+
122
−
0
View file @
f5991cb0
from
functools
import
reduce
import
warnings
import
warnings
...
@@ -20,6 +21,127 @@ as_df = r("as.data.frame")
...
@@ -20,6 +21,127 @@ as_df = r("as.data.frame")
from
rpy2.rinterface
import
RRuntimeError
from
rpy2.rinterface
import
RRuntimeError
from
rpy2.robjects.packages
import
importr
from
rpy2.robjects.packages
import
importr
deseq2
=
importr
(
"
DESeq2
"
)
deseq2
=
importr
(
"
DESeq2
"
)
from
pybedtools
import
BedTool
import
networkx
as
nx
class Exon(object):
    """
    Minimal genomic interval: a chromosome name and two coordinates.

    The interval is handled as half-open, [start, end): its length is
    ``end - start`` and two intervals that merely touch do not overlap.

    Instances compare and hash by value (chrom, start, end), so that the same
    exon reported several times (e.g. once per transcript) is recognised as
    already present in a set, a dict or a graph. Do not modify the attributes
    of an instance once it is stored in such a container.
    """
    __slots__ = ("chrom", "start", "end")

    def __init__(self, chrom, start, end):
        self.chrom = chrom
        self.start = start
        self.end = end

    def __eq__(self, other):
        if not isinstance(other, Exon):
            return NotImplemented
        return (
            (self.chrom, self.start, self.end)
            == (other.chrom, other.start, other.end))

    def __hash__(self):
        # Must stay consistent with __eq__
        return hash((self.chrom, self.start, self.end))

    def __repr__(self):
        return "Exon({!r}, {!r}, {!r})".format(
            self.chrom, self.start, self.end)

    def overlap(self, other):
        """
        Return True if *self* and *other* are on the same chromosome and
        the start of one of them falls within the other.
        """
        if self.chrom != other.chrom:
            return False
        return (
            (self.start <= other.start < self.end)
            or (other.start <= self.start < other.end))

    def merge(self, other):
        """
        Return a new Exon spanning from the smallest start to the largest end
        of *self* and *other*, on the chromosome of *self*.

        Overlap is deliberately not checked: two exons can be linked
        indirectly, through a third one overlapping both.
        """
        return Exon(
            self.chrom,
            min(self.start, other.start),
            max(self.end, other.end))

    def __len__(self):
        return self.end - self.start


# Function-style aliases, usable as plain binary functions
# (for instance with functools.reduce).
overlap = Exon.overlap
merge = Exon.merge
class Gene(object):
    """
    This object contains information obtained from a gtf file.

    Attributes:
        gene_id: identifier of the gene.
        exons: networkx Graph whose nodes are the Exon objects of the gene.
        union_exon_length: None until set_union_exon_length() is called,
            then the total length covered by the exons of the gene.
    """
    __slots__ = ("gene_id", "exons", "union_exon_length")

    def __init__(self, gene_id):
        self.gene_id = gene_id
        self.exons = nx.Graph()
        self.union_exon_length = None

    def add_exon(self, feature):
        """
        Register the exon described by *feature*.

        *feature* only needs ``chrom``, ``start`` and ``end`` attributes.
        """
        exon = Exon(feature.chrom, feature.start, feature.end)
        if exon not in self.exons:
            self.exons.add_node(exon)

    # The merging cannot be done on the full BedTool because we don't want
    # to merge together exons not belonging to the same gene.
    def set_union_exon_length(self):
        """
        Compute the total length covered by the exons of the gene, counting
        overlapping portions only once, and store it in
        ``self.union_exon_length``. Nothing is returned.

        The exons are sorted by (chrom, start, end) and swept once, growing
        the current merged interval as long as the next exon starts within
        (or right at the end of) it. Overlap is not transitive, so grouping
        exons by direct pairwise overlap is not enough: exons chained through
        intermediate ones have to end up in the same merged interval.
        """
        intervals = sorted(
            (exon.chrom, exon.start, exon.end)
            for exon in self.exons.nodes())
        total = 0
        # Current merged interval: None before the first exon is seen
        cur_chrom = cur_start = cur_end = None
        for (chrom, start, end) in intervals:
            if cur_chrom is not None and chrom == cur_chrom and start <= cur_end:
                # Still inside (or adjacent to) the current merged interval
                if end > cur_end:
                    cur_end = end
            else:
                if cur_chrom is not None:
                    total += cur_end - cur_start
                cur_chrom, cur_start, cur_end = chrom, start, end
        if cur_chrom is not None:
            total += cur_end - cur_start
        self.union_exon_length = total
def gtf_2_genes_exon_lengths(gtf_filename):
    """
    Returns a pandas DataFrame where union exon lengths are associated to gene IDs.

    Only the entries of the gtf file *gtf_filename* whose feature type (third
    field) is "exon" are used. They are grouped by their "gene_id" attribute,
    and the union exon length of each gene is computed by
    Gene.set_union_exon_length.

    The index of the returned DataFrame is named "gene".
    """
    genes = {}
    # The features are read from the open file, so the iteration has to take
    # place inside the "with" block, which then closes the file.
    with open(gtf_filename, "r") as gtf_file:
        for feature in BedTool(gtf_file).features():
            if feature[2] != "exon":
                continue
            gene_id = feature.attrs["gene_id"]
            if gene_id not in genes:
                genes[gene_id] = Gene(gene_id)
            # A given exon may be registered for several transcripts,
            # hence several gtf entries: Gene.add_exon deals with repeats.
            genes[gene_id].add_exon(feature)
    for gene in genes.values():
        gene.set_union_exon_length()
    # NOTE(review): the Series name is a 1-tuple, kept as is for
    # compatibility. repeat_bed_2_lengths uses the plain string
    # "union_exon_len": confirm which one is intended.
    return pd.DataFrame(pd.Series(
        {gene.gene_id: gene.union_exon_length for gene in genes.values()},
        name=("union_exon_len",)).rename_axis("gene"))
def repeat_bed_2_lengths(repeat_bed):
    """
    Computes the lengths of repetitive elements in a bed file, grouped by families.

    This assumes that the elements have their names composed of the family name,
    then a colon, then a number. For instance:
    Simple_repeat|Simple_repeat|(TTTTTTG)n:1
    Simple_repeat|Simple_repeat|(TTTTTTG)n:2
    Simple_repeat|Simple_repeat|(TTTTTTG)n:3
    Simple_repeat|Simple_repeat|(TTTTTTG)n:4
    -> Simple_repeat|Simple_repeat|(TTTTTTG)n

    Only the part after the last colon is removed. A name containing no colon
    is used as is as family name.

    *repeat_bed* is passed to pandas.read_table: the file is expected to have
    no header and to provide start, end and name in its 2nd, 3rd and 4th
    columns.

    Returns a DataFrame associating the summed lengths to the family names.
    The index is named "gene" and the column "union_exon_len".
    """
    # usecols=[1, 2, 3]: start, end, id
    # index_col=2: id (relative to the selected columns)
    start_ends = pd.read_table(
        repeat_bed, usecols=[1, 2, 3], header=None, index_col=2)
    # bed lengths (columns keep their original labels: 1 is start, 2 is end)
    lens = start_ends[2] - start_ends[1]
    lens.name = "union_exon_len"
    # rsplit with maxsplit=1 only drops what follows the last colon,
    # and leaves colon-less names untouched.
    repeat_families = [
        name.rsplit(":", 1)[0] for name in start_ends.index]
    # The reads assigned to a repeated element can come
    # from the summed length of all the members of the family
    # We call this "gene" for convenience and compatibility
    return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
def
do_deseq2
(
cond_names
,
conditions
,
counts_data
,
def
do_deseq2
(
cond_names
,
conditions
,
counts_data
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment