Commit ad17068e authored by David  BIKARD's avatar David BIKARD

Delete EcoWG1_library_design.ipynb

parent 761f9b8b
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "JISrOc0mt35M"
},
"source": [
"## EcoWG1 library design"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 436
},
"colab_type": "code",
"id": "wvewZLayQHpN",
"outputId": "66a35634-3629-4fff-f3e9-f5e0de0decf4"
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"from Bio import SeqIO\n",
"%matplotlib inline\n",
"from tqdm import tqdm\n",
"\n",
"data_path=\"data/\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"records=list(SeqIO.parse(data_path+\"GCF_000005845.2_ASM584v2_genomic.gbff\",\"genbank\"))\n",
"#records=list(SeqIO.parse(data_path+\"NC_000913.2.gb\",\"genbank\"))\n",
"\n",
"nb_bases_left=20\n",
"nb_bases_right=20\n",
"def generate_all_targets_df(records):\n",
" '''Generates a dataframe of all the target positions in the contigs provided in records. \n",
" The function currently exludes targets 20 bases from each sides of the contigs to enable the computation \n",
" of a score for dCas9.'''\n",
" \n",
" targets=[] #creates a list targets\n",
" for nb,contig in enumerate(records): #enumerates() adds a row number corresponding to each iteration starting with 0\n",
" #nb is the contig number?\n",
"\n",
" i=21+nb_bases_left\n",
" while True:\n",
" p=contig.seq.find(\"GG\",start=i) #p is defined as position of a PAM\n",
" left=p-21-nb_bases_left\n",
" right=p+2+nb_bases_right\n",
" \n",
" if (p==-1)|(right>len(contig.seq)): break # loop stops if PAM is at the very end of the sequence so the gRNA would exceed the sequence\n",
" \n",
" seq=str(contig.seq[left:right])\n",
" guide=seq[nb_bases_left:nb_bases_left+20]\n",
" contig_nb=nb+1\n",
" strand=\"+\"\n",
" pos=(p-1)%len(contig.seq) #we want pos to be the position of the \"N\" in \"NGG\"\n",
" targets.append([guide,seq,contig_nb,strand,pos]) #adds to the list target the gRNA seq contig, the contig number, the strand and the position\n",
" i=p+1\n",
" \n",
" # Dealing with reverse complement \n",
" i=21+nb_bases_left\n",
" refseqrev=contig.seq.reverse_complement() \n",
" while True:\n",
" p=refseqrev.find(\"GG\",start=i)\n",
" left=p-21-nb_bases_left\n",
" right=p+2+nb_bases_right\n",
" \n",
" if p==-1|(right>len(contig.seq)): break\n",
"\n",
" seq=str(refseqrev[left:right])\n",
" guide=seq[nb_bases_left:nb_bases_left+20]\n",
" contig_nb=nb+1\n",
" strand=\"-\"\n",
" pos=(-p)%len(refseqrev) #we want pos to be the position of the \"N\" in \"NGG\"\n",
" targets.append([guide,seq,contig_nb,strand,pos])\n",
" i=p+1\n",
" \n",
" targets=pd.DataFrame(targets,columns=['guide','seq','contig','strand','pos']) #transforms the list targets in a dataframe\n",
" \n",
" \n",
" '''adds column 'gene' to the dataframe 'targets' providing the gene name (if present) or locus_tag and NaN if no gene name or locus tag is present '''\n",
" \n",
" targets['gene']='NaN' #defines new column gene in dataframe target that contains for all rows NaN, in case a gene name of locus tag is present NaN is overwritten\n",
" targets['gene_ori']=None\n",
" targets['targets_coding_strand']=None\n",
" targets[\"second_half_gene\"]=False\n",
" \n",
" for nb,contig in enumerate(records): #for all contigs in records // nb,contig means all contig numbers???\n",
" features=[feat for feat in contig.features if feat.type=='gene']\n",
" for feature in tqdm(features):\n",
" start=feature.location.start\n",
" end=feature.location.end\n",
" L=end-start\n",
" gene_name=feature.qualifiers['gene'][0] if 'gene' in feature.qualifiers else feature.qualifiers['locus_tag'][0]\n",
" in_gene=(targets.pos>start)&(targets.pos<end-20)\n",
" gene_df=targets.loc[in_gene].copy()\n",
" gene_df[[\"gene\",\"gene_ori\"]] = gene_name, feature.strand\n",
" if feature.strand==1:\n",
" gene_df.loc[(gene_df.pos>start+int(L/2))&(gene_df.pos<end-20),\"second_half_gene\"]=True\n",
" gene_df.loc[(gene_df.strand==\"+\"),'targets_coding_strand']=False\n",
" gene_df.loc[(gene_df.strand==\"-\"),'targets_coding_strand']=True\n",
" elif feature.strand==-1:\n",
" gene_df.loc[(gene_df.pos>start)&(gene_df.pos<end-int(L/2)),\"second_half_gene\"]=True\n",
" gene_df.loc[(gene_df.strand==\"+\"),'targets_coding_strand']=True\n",
" gene_df.loc[(gene_df.strand==\"-\"),'targets_coding_strand']=False\n",
" targets.loc[in_gene]=gene_df\n",
" \n",
" return targets\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████| 4566/4566 [11:45<00:00, 6.47it/s]\n"
]
}
],
"source": [
"targets=generate_all_targets_df(records)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Adding the model preditions to the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"bases=[\"A\",\"T\",\"G\",\"C\"]\n",
"def encode(seq):\n",
" return np.array([[int(b==p) for b in seq] for p in bases])\n",
"\n",
"def encode_seqarr(seq,r):\n",
" '''One hot encoding of the sequence. r specifies the position range.'''\n",
" X = np.array(\n",
" [encode(''.join([s[i] for i in r])) for s in seq]\n",
" )\n",
" X = X.reshape(X.shape[0], -1)\n",
" return X \n",
"\n",
"Xlin = encode_seqarr(targets[\"seq\"],list(range(34,41))+list(range(43,59)))\n",
"\n",
"with open(\"reg_coef.pkl\",\"rb\") as handle:\n",
" reg_coef=pickle.load(handle)\n",
"\n",
" \n",
"def predict(X):\n",
" return np.sum(X*reg_coef,axis=1)\n",
"\n",
"y_pred= predict(Xlin)\n",
"targets['score']=y_pred"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>guide</th>\n",
" <th>seq</th>\n",
" <th>contig</th>\n",
" <th>strand</th>\n",
" <th>pos</th>\n",
" <th>gene</th>\n",
" <th>gene_ori</th>\n",
" <th>targets_coding_strand</th>\n",
" <th>second_half_gene</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ACGGGCAATATGTCTCTGTG</td>\n",
" <td>AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>40</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>-0.565078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCTGATAGCAGCTTCTGAAC</td>\n",
" <td>TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>78</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.194482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AATTAAAATTTTATTGACTT</td>\n",
" <td>CTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>117</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.383340</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CTAAATACTTTAACCAATAT</td>\n",
" <td>AATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCAT...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>143</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>0.331188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ACCACCATCACCATTACCAC</td>\n",
" <td>GCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAA...</td>\n",
" <td>1</td>\n",
" <td>+</td>\n",
" <td>236</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>-0.182206</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" guide seq \\\n",
"0 ACGGGCAATATGTCTCTGTG AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATT... \n",
"1 TCTGATAGCAGCTTCTGAAC TGTGGATTAAAAAAAGAGTGTCTGATAGCAGCTTCTGAACTGGTTA... \n",
"2 AATTAAAATTTTATTGACTT CTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCA... \n",
"3 CTAAATACTTTAACCAATAT AATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATAGGCAT... \n",
"4 ACCACCATCACCATTACCAC GCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAA... \n",
"\n",
" contig strand pos gene gene_ori targets_coding_strand second_half_gene \\\n",
"0 1 + 40 NaN None None False \n",
"1 1 + 78 NaN None None False \n",
"2 1 + 117 NaN None None False \n",
"3 1 + 143 NaN None None False \n",
"4 1 + 236 NaN None None False \n",
"\n",
" score \n",
"0 -0.565078 \n",
"1 0.194482 \n",
"2 0.383340 \n",
"3 0.331188 \n",
"4 -0.182206 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"targets.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Insert noffs calculation here\n",
"\n",
"def save_obj(obj, name ):\n",
" with open(data_path+'obj/'+ name + '.pkl', 'wb+') as f:\n",
" pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)\n",
"\n",
"def load_obj(name ):\n",
" with open(data_path+'obj/' + name + '.pkl', 'rb') as f:\n",
" return pickle.load(f)\n",
"\n",
"dataseed=pd.read_excel(data_path+\"5ntseeds.xlsx\")\n",
"dataseed=dataseed.sort_values(\"mean_E18\")\n",
"badseeds=dataseed.loc[:30,\"seeds\"].values\n",
"\n",
"dico_plus = load_obj(\"dico_plus\")\n",
"dico_moins = load_obj(\"dico_moins\")\n",
"dico_prom_plus = load_obj(\"dico_prom_plus\")\n",
"dico_prom_moins = load_obj(\"dico_prom_moins\")\n",
"dico_gene_right = load_obj(\"dico_gene_right\")\n",
"dico_gene_wrong = load_obj(\"dico_gene_wrong\")\n",
"\n",
"def minus_list(x,y):\n",
" return [item for item in x if item not in y]\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from time import time\n",
"from IPython import display\n",
"def off_targetsFixed(guides=[],dico_plus=[],dico_moins=[],dico_prom_plus=[],dico_prom_moins=[],dico_gene_right=[],dico_gene_wrong=[]):\n",
" off_12_plus=[0]*len(guides) \n",
" off_12_moins=[0]*len(guides) \n",
" off_9_prom_plus=[0]*len(guides) \n",
" off_9_prom_moins=[0]*len(guides)\n",
" off_10_gene_right=[0]*len(guides) \n",
" off_10_gene_wrong=[0]*len(guides)\n",
" off_11_gene_right=[0]*len(guides) \n",
" off_11_gene_wrong=[0]*len(guides)\n",
" \n",
" off_9_prom=[0]*len(guides)\n",
" off_10_gene=[0]*len(guides) \n",
" off_11_gene=[0]*len(guides) \n",
" off_12=[0]*len(guides) \n",
" \n",
" inbadseeds=[0]*len(guides)\n",
" ntargets=[0]*len(guides)\n",
"\n",
"# line=pd.DataFrame(columns=columns)\n",
" start=time()\n",
" for k,guide in enumerate(guides):\n",
"# start=time()\n",
" if (k+1) % 1000 == 0 :\n",
" display.clear_output(wait=True)\n",
" print(\"guide \"+str(k+1)+ \" time=\"+str(time()-start))\n",
"# print('test if',time()-start)\n",
"\n",
"# print('...')\n",
"# if k==10 :\n",
"# print(time()-start)\n",
"# break\n",
"# break\n",
"# start=time()\n",
"# print('test dataframe creation',time()-start)\n",
"# start=time()\n",
" #break\n",
" inbadseeds[k]=guide[-5:] in badseeds\n",
" ntargets[k]=len(dico_plus[20][guide]+dico_moins[20][guide])\n",
"# print('test fill badseed and ntargets',time()-start)\n",
"# break\n",
"# start=time()\n",
" i=12\n",
" off_12_plus[k]=minus_list(dico_plus[i][guide[-i:]],dico_plus[20][guide])\n",
" off_12_moins[k]=minus_list(dico_moins[i][guide[-i:]],dico_moins[20][guide])\n",
" off_12[k]=len(minus_list(dico_plus[i][guide[-i:]],dico_plus[20][guide])+minus_list(dico_moins[i][guide[-i:]],dico_moins[20][guide]))\n",
" i=9\n",
" off_9_prom_plus[k]=minus_list(dico_prom_plus[i][guide[-i:]],dico_prom_plus[20][guide])\n",
" off_9_prom_moins[k]=minus_list(dico_prom_moins[i][guide[-i:]],dico_prom_moins[20][guide])\n",
" off_9_prom[k]=len(minus_list(dico_prom_plus[i][guide[-i:]],dico_prom_plus[20][guide])+minus_list(dico_prom_moins[i][guide[-i:]],dico_prom_moins[20][guide]))\n",
" i=10\n",
" off_10_gene_right[k]=minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide])\n",
" off_10_gene_wrong[k]=minus_list(dico_gene_wrong[i][guide[-i:]],dico_gene_wrong[20][guide])\n",
" off_10_gene[k]=len(minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide]))\n",
"\n",
" i=11\n",
" off_11_gene_right[k]=minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide])\n",
" off_11_gene_wrong[k]=minus_list(dico_gene_wrong[i][guide[-i:]],dico_gene_wrong[20][guide])\n",
" off_11_gene[k]=len(minus_list(dico_gene_right[i][guide[-i:]],dico_gene_right[20][guide]))\n",
"# print(time()-start)\n",
"# res=pd.concat([res,line])\n",
"# res[res.guides==guide]\n",
"# print(time()-start)\n",
"# if k==1000 :\n",
"# print(time()-start)\n",
"# break\n",
" cols=['guide', 'inbadseeds','ntargets','off_12','off_12_plus','off_12_moins','off_9_prom','off_9_prom_plus','off_9_prom_moins','off_10_gene','off_10_gene_right','off_10_gene_wrong','off_11_gene','off_11_gene_right','off_11_gene_wrong'] \n",
" res=pd.DataFrame(columns=cols)\n",
" res['guide']=guides\n",
" res['inbadseeds']=inbadseeds\n",
" res['ntargets']=ntargets\n",
" \n",
" res['off_12_plus']=off_12_plus\n",
" res['off_12_moins']=off_12_moins\n",
" res['off_12']=off_12\n",
" \n",
" res['off_9_prom_plus']=off_9_prom_plus\n",
" res['off_9_prom_moins']=off_9_prom_moins\n",
" res['off_9_prom']=off_9_prom\n",
" \n",
" res['off_10_gene_right']=off_10_gene_right\n",
" res['off_10_gene_wrong']=off_10_gene_wrong\n",
" res['off_10_gene']=off_10_gene\n",
" \n",
" res['off_11_gene_right']=off_11_gene_right\n",
" res['off_11_gene_wrong']=off_11_gene_wrong\n",
" res['off_11_gene']=off_11_gene\n",
" \n",
" return(res)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"guide 542000 time=25.489531993865967\n"
]
}
],
"source": [
"targets.set_index(targets.guide,inplace=True)\n",
"df_OffTargets=off_targetsFixed(targets.index,dico_plus,dico_moins,dico_prom_plus,dico_prom_moins,dico_gene_right,dico_gene_wrong)\n",
"df_OffTargets.set_index(df_OffTargets.guide,inplace=True)\n",
"df_OffTargets=df_OffTargets.drop(\"guide\",axis=1)\n",
"targets=targets.join(df_OffTargets)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"q3=np.percentile(targets[\"score\"].values,25)\n",
"q2=np.percentile(targets[\"score\"].values,50)\n",
"q1=np.percentile(targets[\"score\"].values,75)\n",
"\n",
"score_quartile=np.ones(targets.shape[0], dtype=int)\n",
"score_quartile[np.where(targets[\"score\"].values < q1)]=2\n",
"score_quartile[np.where(targets[\"score\"].values < q2)]=3\n",
"score_quartile[np.where(targets[\"score\"].values < q3)]=4\n",
"targets[\"score_quartile\"]=score_quartile"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0.3993370889655648, -0.08422371663696479, -0.5913318628945142)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"q1,q2,q3"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#targets.to_csv(data_path+\"all_targets.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#targets=pd.read_csv(data_path+\"all_targets.csv\",index_col=0)\n",
"#targets=targets.rename({\"guide.1\":\"guide\"},axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(243260, 13)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#keep only guides targeting genes and the useful columns\n",
"targets_gene=targets[targets.targets_coding_strand.notnull() & targets.targets_coding_strand]\n",
"targets_gene=targets_gene[[\"gene\",\"guide\",\"strand\",\"pos\",\"ntargets\",\"off_12\",\"off_10_gene\",\"off_11_gene\",\"off_9_prom\",\"inbadseeds\",\"score_quartile\", \"second_half_gene\", \"score\"]]\n",
"targets_gene.shape"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"#sort all the guides and rank them\n",
"targets_gene=targets_gene.sort_values([\"gene\",\"ntargets\",\"off_12\",\"off_11_gene\",\"off_9_prom\",\"inbadseeds\",\"score_quartile\", \"second_half_gene\", \"score\"], \n",
" ascending=[False, True, True, True, True, True, True, True, False])\n",
"\n",
"rank=0\n",
"g=None\n",
"gene_ranks=[]\n",
"for i,row in targets_gene.iterrows():\n",
" if g==row.gene:\n",
" rank+=1\n",
" else:\n",
" g=row.gene\n",
" rank=1\n",
" \n",
" gene_ranks.append(rank)\n",
"\n",
"targets_gene[\"gene_rank\"]=gene_ranks"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#The final library is the top 5 guides for each gene\n",
"library=targets_gene[targets_gene.gene_rank<6]"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "EcoWG1.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment