diff --git a/aiida_base.ipynb b/aiida_base.ipynb new file mode 100644 index 0000000..6f0f4fd --- /dev/null +++ b/aiida_base.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AiiDA \n", + "This implementation uses AiiDA to implement the equation of state workflow with Quantum ESPRESSO.\n", + "It intentionally provides a minimal implementation for the sake of simplicity:\n", + "\n", + "* Quantum ESPRESSO is run locally, but could be run on a remote cluster\n", + "* Not all steps of the procedure are captured by provenance, just the Quantum ESPRESSO calculations\n", + "* The entire procedure is not yet wrapped up in a single workflow\n", + "* The SCF calculations are run serially but could be run in parallel\n", + "\n", + "## Installation and setup\n", + "```\n", + "pip install aiida-core[atomic-tools]\n", + "pip install aiida-shell\n", + "verdi presto\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Profile" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from aiida import load_profile\n", + "\n", + "load_profile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implementation of a new simulation code\n", + "\n", + "AiiDA allows developers to implement complex interfaces for any external code, with features such as input validation, error handling etc.\n", + "However, if you just want to run an external code without too much faff, you can use `aiida-shell` to do so." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from ase.build import bulk\n", + "from aiida.orm import SinglefileData\n", + "\n", + "from functions import write_input\n", + "\n", + "FILEPATH_PSEUDOS = Path.cwd() / 'espresso' / 'pseudo'\n", + "pseudopotentials = {\"Al\": \"Al.pbe-n-kjpaw_psl.1.0.0.UPF\"}\n", + "\n", + "input_string = write_input(\n", + " input_dict={\n", + " \"structure\": bulk('Al'), \n", + " \"pseudopotentials\": pseudopotentials, \n", + " \"kpts\": (3, 3, 3),\n", + " \"calculation\": \"vc-relax\",\n", + " \"smearing\": 0.02,\n", + " },\n", + " return_string=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although the `pw.x` executable can be run without any parsing, let's add a parser that converts the outputs into AiiDA nodes to improve the richness of the provenance:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'xmlschema'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01maiida\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m orm\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01madis_tools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mparsers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m parse_pw\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpw_parser\u001b[39m(dirpath):\n\u001b[1;32m 6\u001b[0m parsed_data \u001b[38;5;241m=\u001b[39m parse_pw(dirpath \u001b[38;5;241m/\u001b[39m 
import shlex

from aiida import orm
from aiida_shell import launch_shell_job
from adis_tools.parsers import parse_pw
from functions import generate_structures


def pw_parser(dirpath):
    """Turn the ``pwscf.xml`` written by ``pw.x`` into AiiDA output nodes.

    :param dirpath: directory holding the retrieved output files; it only
        needs to support ``dirpath / 'pwscf.xml'``.
    :return: dict with the parsed ``structure`` (``StructureData``), the
        ``energy`` (``Float``) and the cell ``volume`` (``Float``).
    """
    parsed_data = parse_pw(dirpath / 'pwscf.xml')
    ase_structure = parsed_data['ase_structure']

    return {
        'structure': orm.StructureData(ase=ase_structure),
        'energy': orm.Float(parsed_data['energy']),
        'volume': orm.Float(ase_structure.get_volume()),
    }


# Scheduler options shared by every `pw.x` run below.
# The shell syntax is `export NAME=value`: without the `=`, bash treats the
# path as a second variable name, rejects it as an invalid identifier and
# never sets ESPRESSO_PSEUDO. The path is quoted so that directories
# containing spaces survive the shell.
pw_options = {
    'prepend_text': f'export ESPRESSO_PSEUDO={shlex.quote(FILEPATH_PSEUDOS.as_posix())}',
    'redirect_stderr': True,
}

# --- Variable-cell relaxation -------------------------------------------
results_relax, node_relax = launch_shell_job(
    'pw.x',
    arguments='-in {input_file}',
    parser=pw_parser,
    nodes={
        'input_file': SinglefileData.from_string(input_string, filename='input.pwi'),
    },
    outputs=['pwscf.xml', ],
    metadata={'options': dict(pw_options)},
)

# --- EOS: one SCF run per rescaled copy of the relaxed structure ---------
scaled_structures = generate_structures(
    results_relax['structure'].get_ase(), [0.9, 0.95, 1.0, 1.05, 1.1])

structures = []
energies = []

scf_inputs = {
    "pseudopotentials": pseudopotentials,
    "kpts": (3, 3, 3),
    "calculation": "scf",
    "smearing": 0.02,
}

for scaled_structure in scaled_structures:

    scf_inputs['structure'] = scaled_structure
    input_string = write_input(scf_inputs, return_string=True)

    results_scf, node_scf = launch_shell_job(
        'pw.x',
        arguments='-in {input_file}',
        parser=pw_parser,
        nodes={
            'input_file': SinglefileData.from_string(input_string, filename='input.pwi'),
        },
        outputs=['pwscf.xml', ],
        # Copy so that no job shares a mutable options dict with another.
        metadata={'options': dict(pw_options)},
    )

    structures.append(results_scf['structure'].get_ase())
    energies.append(results_scf['energy'].value)
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "volumes = [structure.get_volume() for structure in structures]\n", + "\n", + "plt.plot(volumes, energies)\n", + "plt.xlabel('Volume')\n", + "plt.ylabel('Energy')\n", + "plt.savefig('evcurve.png')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission to HPC - up-scaling for high throughput screening " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "AiiDA allows registering remote HPC compute resources in its database, which can then be used for submitting simulations. Registration can be achieved via the CLI with `verdi computer setup`, followed by `verdi computer configure`, or via the Python API (see below). With the first command, the database entry for the `Computer` is created (and the provided information immutably stored in the DB), while the second command _configures_ the compute resource, which are, among others, the SSH connection parameters.\n", + "\n", + "AiiDA provides interfaces for various _transport technologies_, most notably SSH, implemented, as well as the most common schedulers, such as SLURM or SGE. This means that after successful registration in the database, using such HPC resources is as simple as changing a single string in the submission script that specifies the computer where the calculation should be run. When a simulation is started, AiiDA takes care to communicate with the HPC and upload all required files, as well as generate the submission script. In addition to the remote compute resource, also the codes (executables) have to be registered to such that they can then be used to run simulations.\n", + "\n", + "One advantage of this design, among others, is the fact that utilizing different HPCs, even within one workflow, can be easily achieved by using the corresponding identifiers of the compute resources. 
# `get_or_create` keeps this cell idempotent: re-running the notebook re-uses
# the `Computer` that is already in the database instead of creating a
# second one with the same label.
import shlex

from aiida import orm
from aiida.common.exceptions import NotExistent

created, computer = orm.Computer.collection.get_or_create(
    label="Todi",
    description="New Todi HPC at CSCS",
    hostname="todi",
    # AiiDA substitutes `{username}` with the user name on the remote machine.
    workdir="/scratch/{username}/aiida",
    transport_type="core.ssh",
    scheduler_type="core.slurm",
)

if created:
    computer.store()
    computer.set_minimum_job_poll_interval(10.0)
    computer.set_default_mpiprocs_per_machine(128)
    computer.set_append_text('')
    computer.set_prepend_text('')
    computer.configure()

print(computer)

# Register a code for later use.
# PLACEHOLDER: replace with the absolute path of `pw.x` on the remote machine.
PW_EXECUTABLE = "/path/to/qe/bin/pw.x"

# Build the identifier from the computer registered above, so the lookup
# cannot drift from the label that is actually stored in the database.
code_identifier = f"pw.x@{computer.label}"

try:
    code = orm.load_code(code_identifier)
except NotExistent:
    code = orm.InstalledCode(
        label="pw.x",
        computer=computer,
        filepath_executable=PW_EXECUTABLE,
    )
    code.description = "Quantum ESPRESSO pw.x code"
    code.prepend_text = "export OMP_NUM_THREADS=1"
    code.store()
code

# For `aiida-shell` the computer can directly be passed to the metadata options of the simulation
# This is because the executable is directly passed as the first argument to the `launch_shell_job` function
# And the path is resolved internally and a `Code` entity created
# NOTE(review): FILEPATH_PSEUDOS is built from the local working directory;
# presumably the pseudopotentials must exist at that path on the remote
# machine as well - confirm before submitting.
metadata = {
    'options': {
        # `export NAME=value` - the `=` is required, otherwise the variable is never set.
        'prepend_text': f'export ESPRESSO_PSEUDO={shlex.quote(FILEPATH_PSEUDOS.as_posix())}',
        'redirect_stderr': True,
        'computer': computer,
    }
}
E.g., which input nodes should be considered when calculating the hash of a Process? Or to what precision should the version of AiiDA, as well as that of the plugin, be included when evaluating the hash?
The former is used for input/parsed output Data nodes of well-defined types, such as `str` or `int` (using the corresponding AiiDA data types), while raw files are stored in the file repository (as they are often too large to store the contents in the SQL database).\n", + "\n", + "To organize data, AiiDA provides the concepts of _Groups_ that can be used to create collections of specific data nodes, such as, for example, all crystal structures used for a study, or all band structure simulations (but groups with mixed types are of course possible). As AiiDA further stores plenty of useful metadata (such as the creation and last modification time), one can easily query and/or filter data even years after its creation. Without a computational infrastructure like AiiDA, it can be otherwise difficult to navigate files and folders. Of course, also with AiiDA, proper organization using groups, and, possibly, adding additional metadata to the Nodes is just as helpful as keeping a logical directory structure when working with files and folders." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sharing Workflows\n", + "\n", + "Lastly, once all required simulations have been conducted AiiDA, thus concluding a research project, all data can be exported using the `verdi archive create` CLI endpoint. Here, collections of data nodes, groups, or other entities can be selected for exporting (in addition to _all_ profile data). These archives can then be shared with other researchers, who can import them into their AiiDA profiles.\n", + "\n", + "While the data of a concrete, executed workflow is contained in the exported archives, the workflow logic cannot be shared in the same way. Instead, however, it is defined by the Python source code that was used to set up the workflow. 
import os

from ase import Atoms
from ase.io import write
from adis_tools.parsers import parse_pw
import matplotlib.pyplot as plt


def generate_structures(
        structure: Atoms,
        strain_lst: list
    ) -> list[Atoms]:
    """Return one rescaled copy of ``structure`` per entry of ``strain_lst``.

    Every copy has its cell multiplied by ``strain ** (1/3)`` and its atoms
    scaled along with the cell; ``structure`` itself is left untouched.
    """

    def _rescaled(strain):
        # Work on a copy so the caller's structure is never modified.
        scaled = structure.copy()
        scaled.set_cell(scaled.cell * strain**(1/3), scale_atoms=True)
        return scaled

    return [_rescaled(strain) for strain in strain_lst]


def append_to_list(lst: list, item: float):
    """Append ``item`` to ``lst`` in place; nothing is returned."""
    lst.append(item)


def split_string(string: str, character: str) -> list:
    """Split ``string`` at every occurrence of ``character``."""
    pieces = string.split(character)
    return pieces


def write_input(input_dict, working_directory=".", return_string=False):
    """Write ``input.pwi`` into ``working_directory`` through ``ase.io.write``.

    ``input_dict`` must provide the keys ``structure``, ``kpts``,
    ``calculation``, ``smearing`` and ``pseudopotentials``. The directory is
    created when missing. With ``return_string=True`` the written file is read
    back and its content returned; otherwise the function returns ``None``.
    """
    filename = os.path.join(working_directory, 'input.pwi')
    os.makedirs(working_directory, exist_ok=True)

    images = input_dict["structure"]
    kpts = input_dict["kpts"]
    input_data = {
        'calculation': input_dict["calculation"],
        'occupations': 'smearing',
        'degauss': input_dict["smearing"],
    }
    pseudopotentials = input_dict["pseudopotentials"]

    # NOTE(review): ASE's espresso-in writer documents `crystal_coordinates`,
    # not `Crystal` - confirm this keyword has the intended effect.
    write(
        filename=filename,
        images=images,
        Crystal=True,
        kpts=kpts,
        input_data=input_data,
        pseudopotentials=pseudopotentials,
        tstress=True,
        tprnfor=True
    )

    if not return_string:
        return None

    with open(filename) as handle:
        return handle.read()


def collect_output(working_directory="."):
    """Parse ``pwscf.xml`` in ``working_directory``.

    Returns a dict with the parsed ``structure``, its ``energy`` and the
    ``volume`` obtained from the structure's ``get_volume()``.
    """
    parsed = parse_pw(os.path.join(working_directory, 'pwscf.xml'))
    atoms = parsed['ase_structure']
    return {
        "structure": atoms,
        "energy": parsed["energy"],
        "volume": atoms.get_volume(),
    }


def plot_energy_volume_curve(volume_lst, energy_lst):
    """Plot energy against volume with pyplot and save it as ``evcurve.png``."""
    plt.plot(volume_lst, energy_lst)
    plt.xlabel("Volume")
    plt.ylabel("Energy")
    plt.savefig("evcurve.png")