Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
bioinfo_utils
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
bioinfo_utils
Commits
40d0e5e1
Commit
40d0e5e1
authored
Feb 14, 2022
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Add new sRNA category "pimi22GtRF".
parent
d817a7ef
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
requirements.txt
+1
-1
1 addition, 1 deletion
requirements.txt
small_RNA-seq/small_RNA-seq.snakefile
+31
-17
31 additions, 17 deletions
small_RNA-seq/small_RNA-seq.snakefile
with
32 additions
and
18 deletions
requirements.txt
+
1
−
1
View file @
40d0e5e1
...
...
@@ -18,7 +18,7 @@ libdeseq @ git+https://gitlab.pasteur.fr/bli/libdeseq.git@196ee3d15a125fad0d8212
libhts
@
git+https://gitlab.pasteur.fr/bli/libhts.git@5ac2b62e23a1b28f54d1ce2a483c8df082dfbebc
libreads
@
git+https://gitlab.pasteur.fr/bli/libreads.git@91db379cd379f8f12fccdd3840d4369b7f09d444
libriboseq
@
git+https://gitlab.pasteur.fr/bli/libriboseq.git@95a0837ea703054a98f2f3b098818000828c6f8f
libsmallrna
@
git+https://gitlab.pasteur.fr/bli/libsmallrna.git@6438ba5a2489db60401160099c2ea358718c9b80
libsmallrna
@
git+https://gitlab.pasteur.fr/bli/libsmallrna.git@f09cb493959dd8c944b5bea3ec14ccb9baf7bd4c
libworkflows
@
git+https://gitlab.pasteur.fr/bli/libworkflows.git@b29b854ff1db6c87386007808286207b8af11b9d
mappy
==2.17
matplotlib
==3.3.2
...
...
...
...
This diff is collapsed.
Click to expand it.
small_RNA-seq/small_RNA-seq.snakefile
+
31
−
17
View file @
40d0e5e1
...
...
@@ -251,7 +251,7 @@ assert set(DE_TYPES) <= set(SMALL_TYPES + JOINED_SMALL_TYPES), "%s\n%s" % (", ".
# TODO: update cross_HTS with pimi22G
# TODO: what kind of pimi22G ? -> pi, mi and si_22G, but not siu_22G
# TODO: add tRFs either in pimi22G or alone? If alone, check that label_order in plot_fold_heatmap is correct
IP_TYPES = [f"pimi{SI_MIN}G", f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
IP_TYPES = [f"pimi{SI_MIN}G",
f"pimi{SI_MIN}GtRF",
f"siu_{SI_MIN}G", f"prot_si_{SI_MIN}G", f"ri_si_{SI_MIN}G",
f"siu_{SI_MAX}G", f"prot_si_{SI_MAX}G", f"ri_si_{SI_MAX}G"]
assert set(IP_TYPES) <= set(
SMALL_TYPES + [f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G"] + JOINED_SMALL_TYPES), ", ".join(IP_TYPES)
...
...
@@ -842,6 +842,7 @@ else:
if IP_CONTRASTS:
all_contrasts_folds = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}G_mean_log2_RPM_fold.txt"),
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"pimi{SI_MIN}GtRF_mean_log2_RPM_fold.txt"),
# To have RPM (folds) for transgenes (which are not in prot_si category)
OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", f"all_si_{SI_MIN}G_mean_log2_RPM_fold.txt")]
ip_fold_boxplots_by_contrast = expand(
...
...
@@ -923,17 +924,18 @@ rule all:
# feature_counts_dir,
# "all_{small_type}_{mapping_type}_{biotype}_{orientation}_transcript_counts.txt"),
# small_type=READ_TYPES_FOR_MAPPING, mapping_type=[f"on_{genome}"], biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
# Note (14/02/2022): no pi here: Why ? -> Let's add pi while we're adding missing tRF
expand(
OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["pi", "mi", "tRF", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
# piRNA and satel_siu raise ValueError: `dataset` input should have multiple elements when plotting
# simrep_siu raise TypeError: Empty 'DataFrame': no numeric data to plot
expand(
OPJ("figures", "{small_type}_norm_correlations.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
expand(
OPJ("figures", "{small_type}_norm_counts_distrib.pdf"),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G"]),
small_type=["mi", *SI_TYPES, *SIU_TYPES, f"pimi{SI_MIN}G", f"pimi{SI_MIN}GtRF"]),
#absolute = "/pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/includes/link_raw_data.rules"
#relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
...
...
@@ -1932,8 +1934,8 @@ rule join_all_sisiu_counts:
#TODO: add tRF, then change category name
rule join_pimi22G_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G"""
rule join_pimi22GtRF_counts:
f"""concat si_{SI_MIN}G with mi and pi into pimi{SI_MIN}G
(a.k.a pisimi) and then with tRF into pimi{SI_MIN}GtRF (a.k.a pisimitRF)
"""
input:
pi_counts_table = OPJ(
annot_counts_dir,
...
...
@@ -1944,10 +1946,17 @@ rule join_pimi22G_counts:
mi_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "mi_counts.txt"),
tRF_counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "tRF_counts.txt"),
output:
counts_table = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimi_counts.txt"),
# Also generate pisimitRF_counts.txt
counts_table_plus_tRF = OPJ(
annot_counts_dir,
f"all_{size_selected}_on_{genome}", "pisimitRF_counts.txt"),
run:
pi_counts_data = pd.read_table(input.pi_counts_table, index_col="gene")
si_22G_counts_data = pd.read_table(input.si_22G_counts_table, index_col="gene")
...
...
@@ -1955,6 +1964,10 @@ rule join_pimi22G_counts:
counts_data = pd.concat([pi_counts_data, si_22G_counts_data, mi_counts_data])
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
tRF_counts_data = pd.read_table(input.tRF_counts_table, index_col="gene")
pisimitRF_counts_data = pd.concat([counts_data, tRF_counts_data])
pisimitRF_counts_data.index.names = ["gene"]
pisimitRF_counts_data.to_csv(output.counts_table_plus_tRF, sep="\t")
@wc_applied
...
...
@@ -1974,7 +1987,10 @@ def source_small_RNA_counts(wildcards):
# return rules.join_pisimi_counts.output.counts_table
if wildcards.small_type == f"pimi{SI_MIN}G":
# si_22G and also pi and mi
return rules.join_pimi22G_counts.output.counts_table
return rules.join_pimi22GtRF_counts.output.counts_table
elif wildcards.small_type == f"pimi{SI_MIN}GtRF":
# si_22G and also pi and mi and tRF
return rules.join_pimi22GtRF_counts.output.counts_table_plus_tRF
elif wildcards.small_type in {f"sisiu_{suffix}" for suffix in SI_SUFFIXES}:
# si and siu
return rules.join_sisiu_counts.output.counts_table
...
...
@@ -2361,21 +2377,19 @@ rule compute_RPM_folds:
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_with_tags = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in lfc_with_tags are: {lfc_with_tags.columns}\n")
logfile.write(f"Index in lfc_with_tags is: {lfc_with_tags.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_with_tags_idx = lfc_with_tags.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
lfc_xor_lfc_with_tags = lfc_idx.symmetric_difference(lfc_with_tags_idx)
RPM_xor_lfc_with_tags = RPM_idx.symmetric_difference(lfc_with_tags_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_lfc_with_tags: {lfc_xor_lfc_with_tags}\nRPM_xor_lfc_with_tags: {RPM_xor_lfc_with_tags}\n")
with_tags = pd.concat((RPM, lfc_with_tags), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
...
...
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
sign in
to comment