Commit e4ed8a62 authored by David  BIKARD's avatar David BIKARD

Initial commit

parent 90b4d9c7
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "JISrOc0mt35M"
},
"source": [
"## EcoWG1 library design"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 436
},
"colab_type": "code",
"id": "wvewZLayQHpN",
"outputId": "66a35634-3629-4fff-f3e9-f5e0de0decf4"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"from Bio import SeqIO\n",
"%matplotlib inline\n",
"from tqdm import tqdm\n",
"\n",
"data_path=\"data/\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"records=list(SeqIO.parse(data_path+\"GCF_000005845.2_ASM584v2_genomic.gbff\",\"genbank\"))\n",
"#records=list(SeqIO.parse(data_path+\"NC_000913.2.gb\",\"genbank\"))\n",
"\n",
"nb_bases_left=20\n",
"nb_bases_right=20\n",
"def generate_all_targets_df(records):\n",
" '''Generates a dataframe of all the target positions in the contigs provided in records. \n",
" The function currently exludes targets 20 bases from each sides of the contigs to enable the computation \n",
" of a score for dCas9.'''\n",
" \n",
" targets=[] #creates a list targets\n",
" for nb,contig in enumerate(records): #enumerates() adds a row number corresponding to each iteration starting with 0\n",
" #nb is the contig number?\n",
"\n",
" i=21+nb_bases_left\n",
" while True:\n",
" p=contig.seq.find(\"GG\",start=i) #p is defined as position of a PAM\n",
" left=p-21-nb_bases_left\n",
" right=p+2+nb_bases_right\n",
" \n",
" if (p==-1)|(right>len(contig.seq)): break # loop stops if PAM is at the very end of the sequence so the gRNA would exceed the sequence\n",
" \n",
" seq=str(contig.seq[left:right])\n",
" guide=seq[nb_bases_left:nb_bases_left+20]\n",
" contig_nb=nb+1\n",
" strand=\"+\"\n",
" pos=(p-1)%len(contig.seq) #we want pos to be the position of the \"N\" in \"NGG\"\n",
" targets.append([guide,seq,contig_nb,strand,pos]) #adds to the list target the gRNA seq contig, the contig number, the strand and the position\n",
" i=p+1\n",
" \n",
" # Dealing with reverse complement \n",
" i=21+nb_bases_left\n",
" refseqrev=contig.seq.reverse_complement() \n",
" while True:\n",
" p=refseqrev.find(\"GG\",start=i)\n",
" left=p-21-nb_bases_left\n",
" right=p+2+nb_bases_right\n",
" \n",
" if p==-1|(right>len(contig.seq)): break\n",
"\n",
" seq=str(refseqrev[left:right])\n",
" guide=seq[nb_bases_left:nb_bases_left+20]\n",
" contig_nb=nb+1\n",
" strand=\"-\"\n",
" pos=(-p)%len(refseqrev) #we want pos to be the position of the \"N\" in \"NGG\"\n",
" targets.append([guide,seq,contig_nb,strand,pos])\n",
" i=p+1\n",
" \n",
" targets=pd.DataFrame(targets,columns=['guide','seq','contig','strand','pos']) #transforms the list targets in a dataframe\n",
" \n",
" \n",
" '''adds column 'gene' to the dataframe 'targets' providing the gene name (if present) or locus_tag and NaN if no gene name or locus tag is present '''\n",
" \n",
" targets['gene']='NaN' #defines new column gene in dataframe target that contains for all rows NaN, in case a gene name of locus tag is present NaN is overwritten\n",
" targets['gene_ori']=None\n",
" targets['targets_coding_strand']=None\n",
" targets[\"second_half_gene\"]=False\n",
" \n",
" for nb,contig in enumerate(records): #for all contigs in records // nb,contig means all contig numbers???\n",
" features=[feat for feat in contig.features if feat.type=='gene']\n",
" for feature in tqdm(features):\n",
" start=feature.location.start\n",
" end=feature.location.end\n",
" L=end-start\n",
" gene_name=feature.qualifiers['gene'][0] if 'gene' in feature.qualifiers else feature.qualifiers['locus_tag'][0]\n",
" in_gene=(targets.pos>start)&(targets.pos<end-20)\n",
" gene_df=targets.loc[in_gene].copy()\n",
" gene_df[[\"gene\",\"gene_ori\"]] = gene_name, feature.strand\n",
" if feature.strand==1:\n",
" gene_df.loc[(gene_df.pos>start+int(L/2))&(gene_df.pos<end-20),\"second_half_gene\"]=True\n",
" gene_df.loc[(gene_df.strand==\"+\"),'targets_coding_strand']=False\n",
" gene_df.loc[(gene_df.strand==\"-\"),'targets_coding_strand']=True\n",
" elif feature.strand==-1:\n",
" gene_df.loc[(gene_df.pos>start)&(gene_df.pos<end-int(L/2)),\"second_half_gene\"]=True\n",
" gene_df.loc[(gene_df.strand==\"+\"),'targets_coding_strand']=True\n",
" gene_df.loc[(gene_df.strand==\"-\"),'targets_coding_strand']=False\n",
" targets.loc[in_gene]=gene_df\n",
" \n",
" return targets\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████| 4566/4566 [11:45<00:00, 6.47it/s]\n"
]
}
],
"source": [
"targets=generate_all_targets_df(records)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Adding the model preditions to the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"bases=[\"A\",\"T\",\"G\",\"C\"]\n",
"def encode(seq):\n",
" return np.array([[int(b==p) for b in seq] for p in bases])\n",
"\n",
"def encode_seqarr(seq,r):\n",
" '''One hot encoding of the sequence. r specifies the position range.'''\n",
" X = np.array(\n",
" [encode(''.join([s[i] for i in r])) for s in seq]\n",
" )\n",
" X = X.reshape(X.shape[0], -1)\n",
" return X \n",
"\n",
"Xlin = encode_seqarr(targets[\"seq\"],list(range(34,41))+list(range(43,59)))\n",
"\n",
"with open(\"reg_coef.pkl\",\"rb\") as handle:\n",
" reg_coef=pickle.load(handle)\n",
"\n",
" \n",
"def predict(X):\n",
" return np.sum(X*reg_coef,axis=1)\n",
"\n",
"y_pred= predict(Xlin)\n",
"targets['score']=y_pred"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>guide</th>\n",
" <th>seq</th>\n",
" <th>contig</th>\n",
" <th>strand</th>\n",
" <th>pos</th>\n",
" <th>gene</th>\n",
" <th>gene_ori</th>\n",
" <th>targets_coding_strand</th>\n",
" <th>second_half_gene</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACGGGCAATATGTCTCTGTG</td>\n",
" <td>AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>40</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>-0.565078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCTGATAGCAGCTTCTGAAC</td>\n",
" <td>TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>78</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.194482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AATTAAAATTTTATTGACTT</td>\n",
" <td>CTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>117</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.383340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CTAAATACTTTAACCAATAT</td>\n",
" <td>AATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCAT...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>143</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.331188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ACCACCATCACCATTACCAC</td>\n",
" <td>GCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>236</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>-0.182206</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" guide seq \\\n",
"0 ACGGGCAATATGTCTCTGTG AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT... \n",
"1 TCTGATAGCAGCTTCTGAAC TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTA... \n",
"2 AATTAAAATTTTATTGACTT CTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCA... \n",
"3 CTAAATACTTTAACCAATAT AATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCAT... \n",
"4 ACCACCATCACCATTACCAC GCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAA... \n",
"\n",
" contig strand pos gene gene_ori targets_coding_strand second_half_gene \\\n",
"0 1 + 40 NaN None None False \n",
"1 1 + 78 NaN None None False \n",
"2 1 + 117 NaN None None False \n",
"3 1 + 143 NaN None None False \n",
"4 1 + 236 NaN None None False \n",
"\n",
" score \n",
"0 -0.565078 \n",
"1 0.194482 \n",
"2 0.383340 \n",
"3 0.331188 \n",
"4 -0.182206 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"targets.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Insert noffs calculation here\n",
"\n",
"def save_obj(obj, name ):\n",
" with open(data_path+'obj/'+ name + '.pkl', 'wb+') as f:\n",
" pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)\n",
"\n",
"def load_obj(name ):\n",
" with open(data_path+'obj/' + name + '.pkl', 'rb') as f:\n",
" return pickle.load(f)\n",
"\n",
"dataseed=pd.read_excel(data_path+\"5ntseeds.xlsx\")\n",
"dataseed=dataseed.sort_values(\"mean_E18\")\n",
"badseeds=dataseed.loc[:30,\"seeds\"].values\n",
"\n",
"dico_plus = load_obj(\"dico_plus\")\n",
"dico_moins = load_obj(\"dico_moins\")\n",
"dico_prom_plus = load_obj(\"dico_prom_plus\")\n",
"dico_prom_moins = load_obj(\"dico_prom_moins\")\n",
"dico_gene_right = load_obj(\"dico_gene_right\")\n",
"dico_gene_wrong = load_obj(\"dico_gene_wrong\")\n",
"\n",
"def minus_list(x,y):\n",
" return [item for item in x if item not in y]\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from time import time\n",
"from IPython import display\n",
"def off_targetsFixed(guides=[],dico_plus=[],dico_moins=[],dico_prom_plus=[],dico_prom_moins=[],dico_gene_right=[],dico_gene_wrong=[]):\n",
" off_12_plus=[0]*len(guides) \n",
" off_12_moins=[0]*len(guides) \n",
" off_9_prom_plus=[0]*len(guides) \n",
" off_9_prom_moins=[0]*len(guides)\n",
" off_10_gene_right=[0]*len(guides) \n",
" off_10_gene_wrong=[0]*len(guides)\n",
" off_11_gene_right=[0]*len(guides) \n",
" off_11_gene_wrong=[0]*len(guides)\n",
" \n",
" off_9_prom=[0]*len(guides)\n",
" off_10_gene=[0]*len(guides) \n",
" off_11_gene=[0]*len(guides) \n",
" off_12=[0]*len(guides) \n",
" \n",
" inbadseeds=[0]*len(guides)\n",
" ntargets=[0]*len(guides)\n",
"\n",
"# line=pd.DataFrame(columns=columns)\n",
" start=time()\n",
" for k,guide in enumerate(guides):\n",
"# start=time()\n",
" if (k+1) % 1000 == 0 :\n",
" display.clear_output(wait=True)\n",
" print(\"guide \"+str(k+1)+ \" time=\"+str(time()-start))\n",
"# print('test if',time()-start)\n",
"\n",
"# print('...')\n",
"# if k==10 :\n",
"# print(time()-start)\n",
"# break\n",
"# break\n",
"# start=time()\n",
"# print('test dataframe creation',time()-start)\n",
"# start=time()\n",
" #break\n",
" inbadseeds[k]=guide[-5:] in badseeds\n",
" ntargets[k]=len(dico_plus[20][guide]+dico_moins[20][guide])\n",
"# print('test fill badseed and ntargets',time()-start)\n",
"# break\n",
"# start=time()\n",
" i=12\n",
" off_12_plus[k]=minus_list(dico_plus[i][guide[-i:]],dico_plus[20][guide])\n",
" off_12_moins[k]=minus_list(dico_moins[i][guide[-i:]],dico_moins[20][guide])\n",
" off_12[k]=len(minus_list(dico_plus[i][guide[-i:]],dico_plus[20][guide])+minus_list(dico_moins[i][guide[-i:]],dico_moins[20][guide]))\n",
" i=9\n",
" off_9_prom_plus[k]=minus_list(dico_prom_plus[i][guide[-i:]],dico_prom_plus[20][guide])\n",
" off_9_prom_moins[k]=minus_list(dico_prom_moins[i][guide[-i:]],dico_prom_moins[20][guide])\n",
" off_9_prom[k]=len(minus_list(dico_prom_plus[i][guide[-i:]],dico_prom_plus[20][guide])+minus_list(dico_prom_moins[i][guide[-i:]],dico_prom_moins[20][guide]))\n",
" i=10\n",
" off_10_gene_right[k]=minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide])\n",
" off_10_gene_wrong[k]=minus_list(dico_gene_wrong[i][guide[-i:]],dico_gene_wrong[20][guide])\n",
" off_10_gene[k]=len(minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide]))\n",
"\n",
" i=11\n",
" off_11_gene_right[k]=minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide])\n",
" off_11_gene_wrong[k]=minus_list(dico_gene_wrong[i][guide[-i:]],dico_gene_wrong[20][guide])\n",
" off_11_gene[k]=len(minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide]))\n",
"# print(time()-start)\n",
"# res=pd.concat([res,line])\n",
"# res[res.guides==guide]\n",
"# print(time()-start)\n",
"# if k==1000 :\n",
"# print(time()-start)\n",
"# break\n",
" cols=['guide', 'inbadseeds','ntargets','off_12','off_12_plus','off_12_moins','off_9_prom','off_9_prom_plus','off_9_prom_moins','off_10_gene','off_10_gene_right','off_10_gene_wrong','off_11_gene','off_11_gene_right','off_11_gene_wrong'] \n",
" res=pd.DataFrame(columns=cols)\n",
" res['guide']=guides\n",
" res['inbadseeds']=inbadseeds\n",
" res['ntargets']=ntargets\n",
" \n",
" res['off_12_plus']=off_12_plus\n",
" res['off_12_moins']=off_12_moins\n",
" res['off_12']=off_12\n",
" \n",
" res['off_9_prom_plus']=off_9_prom_plus\n",
" res['off_9_prom_moins']=off_9_prom_moins\n",
" res['off_9_prom']=off_9_prom\n",
" \n",
" res['off_10_gene_right']=off_10_gene_right\n",
" res['off_10_gene_wrong']=off_10_gene_wrong\n",
" res['off_10_gene']=off_10_gene\n",
" \n",
" res['off_11_gene_right']=off_11_gene_right\n",
" res['off_11_gene_wrong']=off_11_gene_wrong\n",
" res['off_11_gene']=off_11_gene\n",
" \n",
" return(res)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"guide 542000 time=25.489531993865967\n"
]
}
],
"source": [
"targets.set_index(targets.guide,inplace=True)\n",
"df_OffTargets=off_targetsFixed(targets.index,dico_plus,dico_moins,dico_prom_plus,dico_prom_moins,dico_gene_right,dico_gene_wrong)\n",
"df_OffTargets.set_index(df_OffTargets.guide,inplace=True)\n",
"df_OffTargets=df_OffTargets.drop(\"guide\",axis=1)\n",
"targets=targets.join(df_OffTargets)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"q3=np.percentile(targets[\"score\"].values,25)\n",
"q2=np.percentile(targets[\"score\"].values,50)\n",
"q1=np.percentile(targets[\"score\"].values,75)\n",
"\n",
"score_quartile=np.ones(targets.shape[0], dtype=int)\n",
"score_quartile[np.where(targets[\"score\"].values < q1)]=2\n",
"score_quartile[np.where(targets[\"score\"].values < q2)]=3\n",
"score_quartile[np.where(targets[\"score\"].values < q3)]=4\n",
"targets[\"score_quartile\"]=score_quartile"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.3993370889655648, -0.08422371663696479, -0.5913318628945142)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q1,q2,q3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#targets.to_csv(data_path+\"all_targets.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#targets=pd.read_csv(data_path+\"all_targets.csv\",index_col=0)\n",
"#targets=targets.rename({\"guide.1\":\"guide\"},axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(243260, 13)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#keep only guides targeting genes and the useful columns\n",
"targets_gene=targets[targets.targets_coding_strand.notnull() & targets.targets_coding_strand]\n",
"targets_gene=targets_gene[[\"gene\",\"guide\",\"strand\",\"pos\",\"ntargets\",\"off_12\",\"off_10_gene\",\"off_11_gene\",\"off_9_prom\",\"inbadseeds\",\"score_quartile\", \"second_half_gene\", \"score\"]]\n",
"targets_gene.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"#sort all the guides and rank them\n",
"targets_gene=targets_gene.sort_values([\"gene\",\"ntargets\",\"off_12\",\"off_11_gene\",\"off_9_prom\",\"inbadseeds\",\"score_quartile\", \"second_half_gene\", \"score\"], \n",
" ascending=[False, True, True, True, True, True, True, True, False])\n",
"\n",
"rank=0\n",
"g=None\n",
"gene_ranks=[]\n",
"for i,row in targets_gene.iterrows():\n",
" if g==row.gene:\n",
" rank+=1\n",
" else:\n",
" g=row.gene\n",
" rank=1\n",
" \n",
" gene_ranks.append(rank)\n",
"\n",
"targets_gene[\"gene_rank\"]=gene_ranks"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#The final library is the top 5 guides for each gene\n",
"library=targets_gene[targets_gene.gene_rank<6]"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "EcoWG1.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment