Commit 0c66ad71 authored by Kenzo-Hugo Hillion
Browse files

update notebook and settings

parent 59aeb54a
......@@ -2,3 +2,6 @@ __pycache__/
*.egg-info/
.env
.idea/
# For settings configuration
web/metagenedb/settings/__init__.py
%% Cell type:markdown id: tags:
## Proof of concept
The idea here is to load all the genes from the IGC annotation file into the database.
%% Cell type:code id: tags:
``` python
import time
# Timer decorator
def timer(function):
    """Decorate *function* so that each completed call prints its run time.

    The returned wrapper forwards all positional and keyword arguments to
    *function* and returns its result unchanged. After the call returns, one
    line containing the function name and the elapsed seconds is printed.
    Nothing is printed when *function* raises.
    """
    import functools  # local import so the decorator carries its own dependency

    # functools.wraps keeps __name__, __doc__, etc. of the decorated function
    # on the wrapper, so introspection and tracebacks stay meaningful.
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration is not affected
        # by system clock adjustments during the call.
        start_time = time.perf_counter()
        result = function(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print("\n[{}] --> EXECUTED TIME: {} seconds".format(function.__name__, elapsed))
        return result

    return wrapper
```
%% Cell type:markdown id: tags:
### Input files
%% Cell type:code id: tags:
``` python
# First candidate path for the IGC annotation summary; it is overridden by the
# assignment on the next line.
IGC_annotation_path = "/grouin/DBs/catalogs/AnnotationInfo/IGC.annotation_OF.summary"
# Path to the "IGC_sample" annotation file; being assigned last, this is the
# value the following cells actually use.
IGC_annotation_path = "/home/khillion/Pasteur/gitlab/metagenedb/dev_data/IGC_sample.annotation_OF.summary"
```
%% Cell type:code id: tags:
``` python
from itertools import islice
from metagenedb.apps.catalog.models import Gene
def read_in_chunks(file_object, chunk_size=500000):
    """Lazily yield successive pieces of *file_object*.

    Args:
        file_object: Any object exposing a ``read(size)`` method.
        chunk_size: Size passed to every ``read`` call. Default: 500000.

    Yields:
        Each non-empty result of ``file_object.read(chunk_size)``, in order.
        Iteration stops at the first empty result (end of file).
    """
    while True:
        data = file_object.read(chunk_size)
        if not data:
            # An empty read means the file is exhausted.
            break
        yield data
def format_gene(raw_line):
    """Build an unsaved ``Gene`` from one tab-separated annotation line.

    Columns are read by 0-based position: 1 -> ``gene_id``,
    2 -> ``gene_length``, 5 -> ``taxonomic_phylum``, 6 -> ``taxonomic_genus``.

    Args:
        raw_line: One line of the annotation file.

    Returns:
        A ``Gene`` instance that has not been validated or saved.

    Raises:
        IndexError: If the line has fewer than seven tab-separated columns.
    """
    # Split on tabs only (as insert_gene does): splitting on any whitespace
    # would shift the column indices whenever a field is empty or contains
    # a space.
    infos = raw_line.rstrip().split('\t')
    return Gene(gene_id=infos[1],
                gene_length=infos[2],
                taxonomic_genus=infos[6],
                taxonomic_phylum=infos[5])
def insert_gene(raw_line):
    """Validate and save a single ``Gene`` parsed from a tab-separated line.

    Columns are read by 0-based position: 1 -> ``gene_id``,
    2 -> ``gene_length``, 6 -> ``taxonomic_genus``, 5 -> ``taxonomic_phylum``.
    The instance is checked with ``full_clean()`` before ``save()`` is called.
    """
    columns = raw_line.rstrip().split('\t')
    field_values = {
        "gene_id": columns[1],
        "gene_length": columns[2],
        "taxonomic_genus": columns[6],
        "taxonomic_phylum": columns[5],
    }
    record = Gene(**field_values)
    # Run model validation first so invalid rows raise before any write.
    record.full_clean()
    record.save()
def format_gene_list(chunk_genes):
    """Return a list with one ``format_gene`` result per line in *chunk_genes*.

    Args:
        chunk_genes: Iterable of raw annotation lines.

    Returns:
        The list of objects built by ``format_gene``, in input order.
    """
    return [format_gene(raw_line) for raw_line in chunk_genes]


def insert_gene_list(chunk_genes):
    """Call ``insert_gene`` on every line in *chunk_genes*, one at a time.

    Args:
        chunk_genes: Iterable of raw annotation lines.
    """
    for raw_line in chunk_genes:
        insert_gene(raw_line)
@timer
def load_annotation_file_to_db_in_chunks(annotation_file, chunk_size=100000):
    """Load genes from an annotation file into the database in chunks.

    The file is read ``chunk_size`` lines at a time, each chunk is handed to
    ``insert_gene_list``, and a running count of the lines read is printed
    after every chunk.

    Args:
        annotation_file: Path of the annotation file to open for reading.
        chunk_size: Maximum number of lines taken from the file per chunk.
    """
    loaded_genes = 0
    with open(annotation_file, 'r') as file:
        while True:
            # islice pulls at most chunk_size lines without reading the whole
            # file into memory.
            chunk_genes = list(islice(file, chunk_size))
            if not chunk_genes:
                break
            loaded_genes += len(chunk_genes)
            # Row-by-row insertion through insert_gene_list. The batch
            # alternative is format_gene_list followed by
            # Gene.objects.bulk_create; only one of the two paths may run,
            # otherwise every chunk would be inserted twice.
            insert_gene_list(chunk_genes)
            print(f"{loaded_genes} genes loaded so far...")
            # NOTE(review): this unconditional break ends the loop after the
            # first chunk, so at most chunk_size lines are ever loaded. It
            # looks like a temporary limit for experiments - confirm, and
            # remove it to load the whole file.
            break
```
%% Cell type:code id: tags:
``` python
# Delete every Gene row so that the next load starts from an empty table.
Gene.objects.all().delete()
# load_annotation_file_to_db_in_chunks(IGC_annotation_path)
```
%% Output
(0, {'catalog.Gene': 0})
%% Cell type:code id: tags:
``` python
# Run the loader on the path held in IGC_annotation_path; the function prints
# its progress and, through the @timer decorator, its execution time.
load_annotation_file_to_db_in_chunks(IGC_annotation_path)
```
%% Output
100000 genes loaded so far...
200000 genes loaded so far...
300000 genes loaded so far...
400000 genes loaded so far...
500000 genes loaded so far...
600000 genes loaded so far...
700000 genes loaded so far...
800000 genes loaded so far...
900000 genes loaded so far...
1000000 genes loaded so far...
1100000 genes loaded so far...
1200000 genes loaded so far...
1300000 genes loaded so far...
1400000 genes loaded so far...
1500000 genes loaded so far...
1600000 genes loaded so far...
1700000 genes loaded so far...
1800000 genes loaded so far...
1900000 genes loaded so far...
2000000 genes loaded so far...
2100000 genes loaded so far...
2200000 genes loaded so far...
2300000 genes loaded so far...
2400000 genes loaded so far...
2500000 genes loaded so far...
2600000 genes loaded so far...
2700000 genes loaded so far...
2800000 genes loaded so far...
2900000 genes loaded so far...
3000000 genes loaded so far...
3100000 genes loaded so far...
3200000 genes loaded so far...
3300000 genes loaded so far...
3400000 genes loaded so far...
3500000 genes loaded so far...
3600000 genes loaded so far...
3700000 genes loaded so far...
3800000 genes loaded so far...
3900000 genes loaded so far...
4000000 genes loaded so far...
4100000 genes loaded so far...
4200000 genes loaded so far...
4300000 genes loaded so far...
4400000 genes loaded so far...
4500000 genes loaded so far...
4600000 genes loaded so far...
4700000 genes loaded so far...
4800000 genes loaded so far...
4900000 genes loaded so far...
5000000 genes loaded so far...
5100000 genes loaded so far...
5200000 genes loaded so far...
5300000 genes loaded so far...
5400000 genes loaded so far...
5500000 genes loaded so far...
5600000 genes loaded so far...
5700000 genes loaded so far...
5800000 genes loaded so far...
5900000 genes loaded so far...
6000000 genes loaded so far...
6100000 genes loaded so far...
6200000 genes loaded so far...
6300000 genes loaded so far...
6400000 genes loaded so far...
6500000 genes loaded so far...
6600000 genes loaded so far...
6700000 genes loaded so far...
6800000 genes loaded so far...
6900000 genes loaded so far...
7000000 genes loaded so far...
7100000 genes loaded so far...
7200000 genes loaded so far...
7300000 genes loaded so far...
7400000 genes loaded so far...
7500000 genes loaded so far...
7600000 genes loaded so far...
7700000 genes loaded so far...
7800000 genes loaded so far...
7900000 genes loaded so far...
8000000 genes loaded so far...
8100000 genes loaded so far...
8200000 genes loaded so far...
8300000 genes loaded so far...
8400000 genes loaded so far...
8500000 genes loaded so far...
8600000 genes loaded so far...
8700000 genes loaded so far...
8800000 genes loaded so far...
8900000 genes loaded so far...
9000000 genes loaded so far...
9100000 genes loaded so far...
9200000 genes loaded so far...
9300000 genes loaded so far...
9400000 genes loaded so far...
9500000 genes loaded so far...
9600000 genes loaded so far...
9700000 genes loaded so far...
9800000 genes loaded so far...
9879896 genes loaded so far...
[load_annotation_file_to_db_in_chunks] --> EXECUTED TIME: 1369.5733745098114 seconds
1000 genes loaded so far...
%% Cell type:code id: tags:
``` python
import metagenedb.settings
```
%% Cell type:code id: tags:
``` python
# Display the MIDDLEWARE list defined in the project settings.
metagenedb.settings.MIDDLEWARE
```
%% Output
['django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'corsheaders.middleware.CorsMiddleware']
%% Cell type:code id: tags:
``` python
# Display the INSTALLED_APPS list defined in the project settings.
metagenedb.settings.INSTALLED_APPS
```
%% Output
['metagenedb.apps.catalog',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'rest_framework',
'django_extensions',
'corsheaders']
%% Cell type:code id: tags:
``` python
```
......