diff --git a/README.md b/README.md index e57cf182787c21d21c3280a269a80e8ddfb30663..ac795c9da7dc1583f46dcd594c0a4f9da3595892 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,10 @@ change the following: happens inside the Multimer model. * The `preset` flag in `run_alphafold.py` and `run_docker.py` was split into `db_preset` and `model_preset`. +* The models to use are not specified using `model_names` but rather using the + `model_preset` flag. If you want to customize which models are used for each + preset, you will have to modify the `MODEL_PRESETS` dictionary in + `alphafold/model/config.py`. * Setting the `data_dir` flag is now needed when using `run_docker.py`. @@ -320,18 +324,124 @@ All steps are the same as when running the monomer system, but you will have to whether all input sequences in the given fasta file are prokaryotic. If that is not the case or the origin is unknown, set to `false` for that fasta. -An example that folds two protein complexes `multimer1` and `multimer2` where -the first is prokaryotic and the second isn't: +An example that folds a protein complex `multimer.fasta` that is prokaryotic: ```bash python3 docker/run_docker.py \ - --fasta_paths=multimer1.fasta,multimer2.fasta \ - --is_prokaryote_list=true,false \ + --fasta_paths=multimer.fasta \ + --is_prokaryote_list=true \ --max_template_date=2020-05-14 \ --model_preset=multimer \ --data_dir=$DOWNLOAD_DIR ``` +### Examples + +Below are examples of how to use AlphaFold in different scenarios. + +#### Folding a monomer + +Say we have a monomer with the sequence `<SEQUENCE>`. The input fasta should be: + +```fasta +>sequence_name +<SEQUENCE> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=monomer.fasta \ + --max_template_date=2021-11-01 \ + --model_preset=monomer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding a homomer + +Say we have a homomer from a prokaryote with 3 copies of the same sequence +`<SEQUENCE>`. 
The input fasta should be: + +```fasta +>sequence_1 +<SEQUENCE> +>sequence_2 +<SEQUENCE> +>sequence_3 +<SEQUENCE> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=homomer.fasta \ + --is_prokaryote_list=true \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding a heteromer + +Say we have a heteromer A2B3 of unknown origin, i.e. with 2 copies of +`<SEQUENCE A>` and 3 copies of `<SEQUENCE B>`. The input fasta should be: + +```fasta +>sequence_1 +<SEQUENCE A> +>sequence_2 +<SEQUENCE A> +>sequence_3 +<SEQUENCE B> +>sequence_4 +<SEQUENCE B> +>sequence_5 +<SEQUENCE B> +``` + +Then run the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=heteromer.fasta \ + --is_prokaryote_list=false \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding multiple monomers one after another + +Say we have two monomers, `monomer1.fasta` and `monomer2.fasta`. + +We can fold both sequentially by using the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=monomer1.fasta,monomer2.fasta \ + --max_template_date=2021-11-01 \ + --model_preset=monomer \ + --data_dir=$DOWNLOAD_DIR +``` + +#### Folding multiple multimers one after another + +Say we have two multimers, `multimer1.fasta` and `multimer2.fasta`. Both are +from a prokaryotic organism. 
+ +We can fold both sequentially by using the following command: + +```bash +python3 docker/run_docker.py \ + --fasta_paths=multimer1.fasta,multimer2.fasta \ + --is_prokaryote_list=true,true \ + --max_template_date=2021-11-01 \ + --model_preset=multimer \ + --data_dir=$DOWNLOAD_DIR +``` + ### AlphaFold output The outputs will be saved in a subdirectory of the directory provided via the diff --git a/alphafold/data/pipeline_multimer.py b/alphafold/data/pipeline_multimer.py index 75bc1a52a84007ad87284735eb8ed444ff69bdbc..46fa7a9ad1522e852560cfe41b5e85f506b819ba 100644 --- a/alphafold/data/pipeline_multimer.py +++ b/alphafold/data/pipeline_multimer.py @@ -202,7 +202,7 @@ class DataPipeline: msa_output_dir: str, is_homomer_or_monomer: bool) -> pipeline.FeatureDict: """Runs the monomer pipeline on a single chain.""" - chain_fasta_str = f'>{description}\n{sequence}\n' + chain_fasta_str = f'>chain_{chain_id}\n{sequence}\n' chain_msa_output_dir = os.path.join(msa_output_dir, chain_id) if not os.path.exists(chain_msa_output_dir): os.makedirs(chain_msa_output_dir) diff --git a/docker/run_docker.py b/docker/run_docker.py index 4eec39c9e1f55c1569d50439daa1ee113b2e95ff..5d0f9beb0990a161736f2a66b821b4b44d80acd7 100644 --- a/docker/run_docker.py +++ b/docker/run_docker.py @@ -32,17 +32,17 @@ flags.DEFINE_string( 'gpu_devices', 'all', 'Comma separated list of devices to pass to NVIDIA_VISIBLE_DEVICES.') flags.DEFINE_list( - 'fasta_paths', None, - 'Paths to FASTA files, each containing one sequence. Paths should be ' + 'fasta_paths', None, 'Paths to FASTA files, each containing a prediction ' + 'target that will be folded one after another. If a FASTA file contains ' + 'multiple sequences, then it will be folded as a multimer. Paths should be ' 'separated by commas. 
All FASTA paths must have a unique basename as the ' 'basename is used to name the output directories for each prediction.') -flags.DEFINE_list('is_prokaryote_list', None, 'Optional for multimer system, ' - 'not used by the single chain system. ' - 'This list should contain a boolean for each fasta ' - 'specifying true where the target complex is from a ' - 'prokaryote, and false where it is not, or where the ' - 'origin is unknown. These values determine the pairing ' - 'method for the MSA.') +flags.DEFINE_list( + 'is_prokaryote_list', None, 'Optional for multimer system, not used by the ' + 'single chain system. This list should contain a boolean for each fasta ' + 'specifying true where the target complex is from a prokaryote, and false ' + 'where it is not, or where the origin is unknown. These values determine ' + 'the pairing method for the MSA.') flags.DEFINE_string( 'output_dir', '/tmp/alphafold', 'Path to a directory that will store the results.') diff --git a/notebooks/AlphaFold.ipynb b/notebooks/AlphaFold.ipynb index 9e20dee003a1e98e51c994a1128ffe9a6864508d..d8c475d80ecfb23d64d4ef1700fac4f2925155d8 100644 --- a/notebooks/AlphaFold.ipynb +++ b/notebooks/AlphaFold.ipynb @@ -648,8 +648,9 @@ " total_num_res = best_unrelaxed_prot.residue_index.shape[-1]\n", " chain_ids = best_unrelaxed_prot.chain_index\n", " for chain_boundary in np.nonzero(chain_ids[:-1] - chain_ids[1:]):\n", - " plt.plot([0, total_num_res], [chain_boundary, chain_boundary], color='red')\n", - " plt.plot([chain_boundary, chain_boundary], [0, total_num_res], color='red')\n", + " if chain_boundary.size:\n", + " plt.plot([0, total_num_res], [chain_boundary, chain_boundary], color='red')\n", + " plt.plot([chain_boundary, chain_boundary], [0, total_num_res], color='red')\n", "\n", " plt.title('Predicted Aligned Error')\n", " plt.xlabel('Scored residue')\n", diff --git a/run_alphafold.py b/run_alphafold.py index 1d5403c1c3ca6fadac7e56513d0af7eb9a735c26..33fae99c8caa732b505afb8f57cad288a18c0dca 
100644 --- a/run_alphafold.py +++ b/run_alphafold.py @@ -43,18 +43,18 @@ from alphafold.model import data logging.set_verbosity(logging.INFO) -flags.DEFINE_list('fasta_paths', None, 'Paths to FASTA files, each containing ' - 'a prediction target. Paths should be separated by commas. ' - 'All FASTA paths must have a unique basename as the ' - 'basename is used to name the output directories for ' - 'each prediction.') -flags.DEFINE_list('is_prokaryote_list', None, 'Optional for multimer system, ' - 'not used by the single chain system. ' - 'This list should contain a boolean for each fasta ' - 'specifying true where the target complex is from a ' - 'prokaryote, and false where it is not, or where the ' - 'origin is unknown. These values determine the pairing ' - 'method for the MSA.') +flags.DEFINE_list( + 'fasta_paths', None, 'Paths to FASTA files, each containing a prediction ' + 'target that will be folded one after another. If a FASTA file contains ' + 'multiple sequences, then it will be folded as a multimer. Paths should be ' + 'separated by commas. All FASTA paths must have a unique basename as the ' + 'basename is used to name the output directories for each prediction.') +flags.DEFINE_list( + 'is_prokaryote_list', None, 'Optional for multimer system, not used by the ' + 'single chain system. This list should contain a boolean for each fasta ' + 'specifying true where the target complex is from a prokaryote, and false ' + 'where it is not, or where the origin is unknown. These values determine ' + 'the pairing method for the MSA.') flags.DEFINE_string('data_dir', None, 'Path to directory of supporting data.') flags.DEFINE_string('output_dir', None, 'Path to a directory that will '