Source code for crystal_builder_step.parse

#!/usr/bin/env python3

"""Process the prototype data from the AFLOW library.

Steps:

1) Manually get the tabular data from the index page
   http://aflowlib.org/prototype-encyclopedia/prototype_index.html

   I did this by opening the page in Safari, using the Developer tools to show the
   sources, opening the scripts in the left panel, and downloading table_sort.min.js.

   In an editor, I copied the array u from the text 'u=[...],v=[]' by searching from
   the bracket in 'u=[' to 'v=[]' and then backing up to the closing bracket of the
   u-array.

   I used black to format it so that the data is easier to read. This file is
   'index.json'.

   I also had to quote the `Prototype` and `Notes` keys to make correct JSON. There
   are also a couple of places where strings contain ", and so are quoted with '; I
   flipped the quotes to make "good" JSON.

   I manually patched these special cases in the index.json file::

       "A_hR1_166_a Hg": "A_hR1_166_a.alpha-Hg",
       "A_hR1_166_a Po": "A_hR1_166_a.beta-Po",
       "AB_mP32_14_4e_4e AsS": "AB_mP32_14_4e_4e.realgar",
       "AB_mP32_14_4e_4e AsS": "AB_mP32_14_4e_4e.pararealgar",
       "AB_cP2_221_a_b H4NNO3": "AB_cP2_221_a_b.NH4.NO3",

2) The `__main__` block below then downloads any CIF files to the CIF/ subdirectory,
   fetching only those that are missing. The URL has the form

   http://aflowlib.org/prototype-encyclopedia/CIF/<AFLOW prototype>.cif
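
   A minimal sketch of that download step, assuming `prototype` holds an AFLOW
   prototype label and `ciffile` the target path (the full logic, including the
   fallback URL used when a prototype has multiple entries, is in the `__main__`
   block below)::

       url = f"http://aflowlib.org/prototype-encyclopedia/CIF/{prototype}.cif"
       r = requests.get(url)
       if r.status_code == requests.codes.ok:
           ciffile.write_text(r.text)
       r.close()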
"""

import json
from pathlib import Path
import re
import requests

import CifFile

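# Map LaTeX Greek-letter commands (as regex patterns matching, e.g., "\alpha") to the
# corresponding Unicode characters. Used by math_mode_helper() below.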
GreekLetters = {
    r"\\alpha": "\N{Greek Small Letter Alpha}",
    r"\\beta": "\N{Greek Small Letter Beta}",
    r"\\gamma": "\N{Greek Small Letter Gamma}",
    r"\\delta": "\N{Greek Small Letter Delta}",
    r"\\epsilon": "\N{Greek Small Letter Epsilon}",
    r"\\zeta": "\N{Greek Small Letter Zeta}",
    r"\\eta": "\N{Greek Small Letter Eta}",
    r"\\theta": "\N{Greek Small Letter Theta}",
    r"\\iota": "\N{Greek Small Letter Iota}",
    r"\\kappa": "\N{Greek Small Letter Kappa}",
    r"\\lambda": "\N{Greek Small Letter Lamda}",
    r"\\mu": "\N{Greek Small Letter Mu}",
    r"\\nu": "\N{Greek Small Letter Nu}",
    r"\\xi": "\N{Greek Small Letter Xi}",
    r"\\omicron": "\N{Greek Small Letter Omicron}",
    r"\\pi": "\N{Greek Small Letter Pi}",
    r"\\rho": "\N{Greek Small Letter Rho}",
    r"\\sigma": "\N{Greek Small Letter Sigma}",
    r"\\tau": "\N{Greek Small Letter Tau}",
    r"\\upsilon": "\N{Greek Small Letter Upsilon}",
    r"\\phi": "\N{Greek Small Letter Phi}",
    r"\\chi": "\N{Greek Small Letter Chi}",
    r"\\psi": "\N{Greek Small Letter Psi}",
    r"\\omega": "\N{Greek Small Letter Omega}",
    r"\\Alpha": "\N{Greek Capital Letter Alpha}",
    r"\\Beta": "\N{Greek Capital Letter Beta}",
    r"\\Gamma": "\N{Greek Capital Letter Gamma}",
    r"\\Delta": "\N{Greek Capital Letter Delta}",
    r"\\Epsilon": "\N{Greek Capital Letter Epsilon}",
    r"\\Zeta": "\N{Greek Capital Letter Zeta}",
    r"\\Eta": "\N{Greek Capital Letter Eta}",
    r"\\Theta": "\N{Greek Capital Letter Theta}",
    r"\\Iota": "\N{Greek Capital Letter Iota}",
    r"\\Kappa": "\N{Greek Capital Letter Kappa}",
    r"\\Lambda": "\N{Greek Capital Letter Lamda}",
    r"\\Mu": "\N{Greek Capital Letter Mu}",
    r"\\Nu": "\N{Greek Capital Letter Nu}",
    r"\\Xi": "\N{Greek Capital Letter Xi}",
    r"\\Omicron": "\N{Greek Capital Letter Omicron}",
    r"\\Pi": "\N{Greek Capital Letter Pi}",
    r"\\Rho": "\N{Greek Capital Letter Rho}",
    r"\\Sigma": "\N{Greek Capital Letter Sigma}",
    r"\\Tau": "\N{Greek Capital Letter Tau}",
    r"\\Upsilon": "\N{Greek Capital Letter Upsilon}",
    r"\\Phi": "\N{Greek Capital Letter Phi}",
    r"\\Chi": "\N{Greek Capital Letter Chi}",
    r"\\Psi": "\N{Greek Capital Letter Psi}",
    r"\\Omega": "\N{Greek Capital Letter Omega}",
}

subscripts = {
    "0": "\N{Subscript Zero}",
    "1": "\N{Subscript One}",
    "2": "\N{Subscript Two}",
    "3": "\N{Subscript Three}",
    "4": "\N{Subscript Four}",
    "5": "\N{Subscript Five}",
    "6": "\N{Subscript Six}",
    "7": "\N{Subscript Seven}",
    "8": "\N{Subscript Eight}",
    "9": "\N{Subscript Nine}",
    "+": "\N{Subscript Plus Sign}",
    "-": "\N{Subscript Minus}",
    "=": "\N{Subscript Equals Sign}",
    "(": "\N{Subscript Left Parenthesis}",
    ")": "\N{Subscript Right Parenthesis}",
    "a": "\N{Latin Subscript Small Letter A}",
    "e": "\N{Latin Subscript Small Letter E}",
    "o": "\N{Latin Subscript Small Letter O}",
    "x": "\N{Latin Subscript Small Letter X}",
    "h": "\N{Latin Subscript Small Letter H}",
    "k": "\N{Latin Subscript Small Letter K}",
    "l": "\N{Latin Subscript Small Letter L}",
    "m": "\N{Latin Subscript Small Letter M}",
    "n": "\N{Latin Subscript Small Letter N}",
    "p": "\N{Latin Subscript Small Letter P}",
    "s": "\N{Latin Subscript Small Letter S}",
    "t": "\N{Latin Subscript Small Letter T}",
    ".": ".",
}
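
# These tables are used by clean_text()/subscript_helper() below; for example
# (illustrative inputs, not necessarily entries in index.json), "Cu<sub>2</sub>O"
# becomes "Cu₂O" and "$\alpha$-Fe" becomes "α-Fe".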

references = """
@Misc{ crystal_builder_step,
        author = {Paul Saxe},
        title = {Crystal Builder plug-in for SEAMM},
        month = {$month},
        year = {$year},
        organization = {The Molecular Sciences Software Institute (MolSSI)},
        url = {https://github.com/molssi-seamm/crystal_builder_step},
        address = {Virginia Tech, Blacksburg, VA, USA},
        version = {$version}
}

@article{MEHL2017S1,
    title = "The AFLOW Library of Crystallographic Prototypes: Part 1",
    journal = "Computational Materials Science",
    volume = "136",
    pages = "S1 - S828",
    year = "2017",
    issn = "0927-0256",
    doi = "https://doi.org/10.1016/j.commatsci.2017.01.017",
    url = "http://www.sciencedirect.com/science/article/pii/S0927025617300241",
    author = "Michael J. Mehl and David Hicks and Cormac Toher and Ohad Levy and Robert M. Hanson and Gus Hart and Stefano Curtarolo",
    keywords = "Crystal Structure, Space Groups, Wyckoff Positions, Lattice Vectors, Basis Vectors, Database",
}

@article{HICKS2019S1,
    title = "The AFLOW Library of Crystallographic Prototypes: Part 2",
    journal = "Computational Materials Science",
    volume = "161",
    pages = "S1 - S1011",
    year = "2019",
    issn = "0927-0256",
    doi = "https://doi.org/10.1016/j.commatsci.2018.10.043",
    url = "http://www.sciencedirect.com/science/article/pii/S0927025618307146",
    author = "David Hicks and Michael J. Mehl and Eric Gossett and Cormac Toher and Ohad Levy and Robert M. Hanson and Gus Hart and Stefano Curtarolo",
    keywords = "Crystal Structure, Space Groups, Wyckoff Positions, Lattice Vectors, Basis Vectors, Database",
}

@article{HICKS2021110450,
    title = {The AFLOW Library of Crystallographic Prototypes: Part 3},
    journal = {Computational Materials Science},
    volume = {199},
    pages = {110450},
    year = {2021},
    issn = {0927-0256},
    doi = {https://doi.org/10.1016/j.commatsci.2021.110450},
    url = {https://www.sciencedirect.com/science/article/pii/S0927025621001750},
    author = {David Hicks and Michael J. Mehl and Marco Esters and Corey Oses and Ohad Levy and Gus L.W. Hart and Cormac Toher and Stefano Curtarolo},
    keywords = {Crystal Structure, Space Groups, Wyckoff Positions, Lattice Vectors, Basis Vectors, Database},
    abstract = {The AFLOW Library of Crystallographic Prototypes has been extended to include a total of 1,100 common crystal structural prototypes (510 new ones with Part 3), comprising all of the inorganic crystal structures defined in the seven-volume Strukturbericht series published in Germany from 1937 through 1943. We cover a history of the Strukturbericht designation system, the evolution of the system over time, and the first comprehensive index of inorganic Strukturbericht designations ever published.}
}"""  # noqa: E501
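
# The static BibTeX entries above are written at the top of CIF/references.bib by the
# __main__ block; per-prototype references extracted from the CIF files are appended
# after them by to_bibtex().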


def clean_text(text):
    """Replace subscripts and other special characters."""
    text = re.sub(r"&ndash;", "\N{En Dash}", text)
    text = re.sub(r"&frasl;", "\N{Fraction Slash}", text)
    text = re.sub(r"&middot;", "\N{Middle Dot}", text)
    text = re.sub(r"&cacute;", "\N{Latin Small Letter C with Acute}", text)
    text = re.sub(r"&auml;", "\N{Latin Small Letter A with Diaeresis}", text)
    text = re.sub(r"&uuml;", "\N{Latin Small Letter U with Diaeresis}", text)
    text = re.sub(r"&approx;", "\N{Almost Equal To}", text)
    text = re.sub(r"<em>([^<]+)</em>", r"*\1*", text)
    text = re.sub(r"<sub>([^<]+)</sub>", subscript_helper, text)
    text = re.sub(r"<q>([^<]+)</q>", r'"\1"', text)
    text = re.sub(
        r"<sup>II</sup>",
        "\N{Modifier Letter Capital I}\N{Modifier Letter Capital I}",
        text,
    )
    text = text.replace("\u00a0", " ")
    text = math_mode(text)

    return text.strip()


def subscript_helper(match):
    """Replace the characters in the match with Unicode subscripts."""
    result = ""
    for char in match.group(1):
        if char in subscripts:
            result += subscripts[char]
        else:
            # print(f"There is no unicode subscript for '{char}'")
            result += char
    return result


def entry_to_string(cif, key):
    """Convert a cif entry to a string."""
    if key not in cif:
        return None

    if isinstance(cif[key], str):
        return cif[key].strip()

    lines = []
    for line in cif[key]:
        line = line.strip()
        if line != "":
            lines.append(line)
    return " ".join(lines)


def simple_spacegroup(text):
    """Reduce a LaTeX space-group symbol to a plain-text form."""
    text = text.strip("$")
    text = re.sub(r"_\{([^}]+)}", r"\1", text)
    text = re.sub(r"\\bar{(.)}", r"-\1", text)
    return text
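
# For example (illustrative input), simple_spacegroup("$Fm\\bar{3}m$") returns
# "Fm-3m", the plain-text form stored as "simple_spacegroup" in the metadata below.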


def math_mode(text):
    """Translate LaTeX math mode to Unicode."""
    text = re.sub(r"_\{([^}]+)}", subscript_helper, text)
    text = re.sub(r"\$([^$]+)\$", math_mode_helper, text)
    text = re.sub(r"\\le", "\N{Less-Than or Equal To}", text)
    return text


def math_mode_helper(match):
    """Replace LaTeX constructs inside math mode with Unicode characters."""
    text = match.group(1)
    text = re.sub(r"\\bar{(.)}", "\\1\N{Combining Overline}", text)
    for pattern, replacement in GreekLetters.items():
        text = re.sub(pattern, replacement, text)
    return text


def to_bibtex(prototype):
    """Extract the reference information and transform to BibTeX."""
    path = Path("CIF") / (prototype + ".cif")
    with open(path) as fd:
        data = CifFile.ReadCif(fd)

    lines = []
    for _, cif in data.items():
        # pprint.pprint({**cif})
        if "_publ_author_name" not in cif:
            continue

        lines.append("@article{" + prototype)
        tmp = cif["_publ_author_name"]
        if isinstance(tmp, str):
            lines.append(" author = {" + tmp + "}")
        else:
            lines.append(" author = {" + " and ".join(tmp) + "}")

        title = entry_to_string(cif, "_publ_Section_title")
        if title is not None:
            lines.append(" title = {" + title + "}")

        journal = entry_to_string(cif, "_journal_name_full_name")
        if journal is not None:
            lines.append(" journal = {" + journal + "}")

        volume = entry_to_string(cif, "_journal_volume")
        if volume is not None:
            lines.append(" volume = {" + volume + "}")

        page_first = entry_to_string(cif, "_journal_page_first")
        page_last = entry_to_string(cif, "_journal_page_last")
        if page_first is not None:
            if page_last is None:
                lines.append(" pages = {" + page_first + "}")
            else:
                lines.append(" pages = {" + page_first + "--" + page_last + "}")

        year = entry_to_string(cif, "_journal_year")
        if year is not None:
            lines.append(" year = " + year)

        lines.append("}")

    return ",\n".join(lines)


if __name__ == "__main__":  # pragma: no cover
    # Read the array of prototypes.
    with open("index.json", "r") as fd:
        prototypes = json.load(fd)

    print(f"There are {len(prototypes)} prototypes.")
    print("Downloading any CIF files that are missing.")

    # Download the CIF files
    cifdir = Path("CIF")
    cifdir.mkdir(exist_ok=True)

    i = 0
    pset = set()
    for data in prototypes:
        name = data["Prototype"]
        # Ufff. Name may have html <sub> </sub>
        name = name.replace("<sub>", "").replace("</sub>", "")
        prototype = data["AFLOW Prototype"]
        if prototype in pset:
            # print(f"{prototype} is a duplicate")
            pass
        else:
            pset.add(prototype)

        ciffile = cifdir / (prototype + ".cif")
        i += 1
        if not ciffile.exists():
            print(f" {i:4} Downloading CIF for {prototype} {name}")
            url = f"http://aflowlib.org/prototype-encyclopedia/CIF/{prototype}.cif"
            r = requests.get(url)
            if r.status_code == requests.codes.ok:
                ciffile.write_text(r.text)
            else:
                # It appears that when there are multiple entries, the name is
                # appended to the prototype
                url = (
                    f"http://aflowlib.org/prototype-encyclopedia/CIF/{prototype}.{name}"
                    ".cif"
                )
                r = requests.get(url)
                if r.status_code == requests.codes.ok:
                    ciffile.write_text(r.text)
                else:
                    print(f" status = {r.status_code}")
            r.close()

    # Process the citations
    path = cifdir / "references.bib"
    print(f"Writing references to {path}.")
    with open(path, "w") as fd:
        fd.write(references)

        pset = set()
        for data in prototypes:
            prototype = data["AFLOW Prototype"]
            if prototype not in pset:
                pset.add(prototype)
                try:
                    text = to_bibtex(prototype)
                except Exception as e:
                    print(f"Error in {prototype}: {str(e)}")
                    continue
                if text is None:
                    print(f"No reference information for {prototype}.")
                else:
                    fd.write(text)
                    fd.write("\n\n")

    # Now create the metadata for the plug-in.
    path = Path("prototypes.json")
    print(f"Writing the metadata to {path}.")

    jdata = {}
    max_sites = 0
    data = {}
    tmp = []
    data["prototype"] = [clean_text(d["Prototype"]) for d in prototypes]
    data["nSpecies"] = [d["# Species"] for d in prototypes]
    data["nAtoms"] = [d["# Atoms"] for d in prototypes]
    data["Pearson symbol"] = [d["Pearson Symbol"] for d in prototypes]
    data["Strukturbericht designation"] = [
        None if d["Struk. Design."] == "None" else clean_text(d["Struk. Design."])
        for d in prototypes
    ]
    data["AFLOW prototype"] = [d["AFLOW Prototype"] for d in prototypes]
    data["space group symbol"] = [
        clean_text(d["Space Group Symbol"]) for d in prototypes
    ]
    data["space group number"] = [d["Space Group Number"] for d in prototypes]
    data["notes"] = [clean_text(d["Notes"]) for d in prototypes]
    data["hyperlink"] = [
        f"http://www.aflowlib.org/CrystalDatabase/{d['AFLOW Prototype']}.html"
        for d in prototypes
    ]

    # Add cell and site info.
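    # The "_aflow_params" entries typically look like "a,c/a,x_{1},z_{2}" (an
    # assumption about the exact format; the loop below also accepts "x1"-style
    # names). x/y/z parameters mark adjustable fractional coordinates of numbered
    # sites, names starting with "\" are cell angles, and the b/a and c/a ratios are
    # replaced by the actual b and c lengths taken from the CIF.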
    cp = data["cell parameters"] = []
    sites = data["sites"] = []
    pset = set()
    for pdata in prototypes:
        prototype = pdata["AFLOW Prototype"]
        if prototype not in pset:
            pset.add(prototype)
            cif_path = Path("CIF") / (prototype + ".cif")
            with open(cif_path, "r") as fd:
                try:
                    tmp = CifFile.ReadCif(fd)
                except Exception as e:
                    print(f"Problem with {cif_path}, {e}")
                    continue
            if tmp is None:
                print(f"Problem with {cif_path}")
                continue

            for _, cif in tmp.items():
                if "_aflow_params" not in cif:
                    print(f"No aflow parameters in {prototype}.")
                else:
                    params = cif["_aflow_params"].split(",")
                    values = cif["_aflow_params_values"].split(",")

                cell_parameters = []
                adjustable = {
                    "x": {},
                    "y": {},
                    "z": {},
                }
                if "_aflow_params" in cif:
                    for param, value in zip(params, values):
                        if param[0] in ("x", "y", "z"):
                            if param[1:3] == "_{":
                                site_no = int(param[3:-1])
                                adjustable[param[0]][site_no] = value
                            else:
                                site_no = int(param[1:])
                                adjustable[param[0]][site_no] = value
                        elif param[0] == "\\":
                            cell_parameters.append((param[1:], value))
                        elif param == "b/a":
                            cell_parameters.append(("b", cif["_cell_length_b"]))
                        elif param == "c/a":
                            cell_parameters.append(("c", cif["_cell_length_c"]))
                        else:
                            cell_parameters.append((param, value))
                cp.append(cell_parameters)

                site_data = []
                site_no = 0
                for symbol, site, mult, x, y, z in zip(
                    cif["_atom_site_type_symbol"],
                    cif["_atom_site_Wyckoff_label"],
                    cif["_atom_site_symmetry_multiplicity"],
                    cif["_atom_site_fract_x"],
                    cif["_atom_site_fract_y"],
                    cif["_atom_site_fract_z"],
                ):
                    site_no += 1
                    if site_no in adjustable["x"]:
                        x = adjustable["x"][site_no]
                        xvar = True
                    else:
                        xvar = False
                    if site_no in adjustable["y"]:
                        y = adjustable["y"][site_no]
                        yvar = True
                    else:
                        yvar = False
                    if site_no in adjustable["z"]:
                        z = adjustable["z"][site_no]
                        zvar = True
                    else:
                        zvar = False
                    site_data.append(
                        (site, int(mult), symbol, x, xvar, y, yvar, z, zvar)
                    )
                sites.append(site_data)

                n_sites = len(site_data)
                max_sites = max_sites if n_sites < max_sites else n_sites

                description = clean_text(pdata["Notes"])
                if description[-10:] == " Structure":
                    description = description[0:-10]

                jdata[prototype] = {
                    "prototype": clean_text(pdata["Prototype"]),
                    "n_elements": pdata["# Species"],
                    "n_atoms": pdata["# Atoms"],
                    "pearson_symbol": pdata["Pearson Symbol"],
                    "strukturbericht": (
                        None
                        if pdata["Struk. Design."] == "None"
                        else clean_text(pdata["Struk. Design."])
                    ),
                    "aflow": prototype,
                    "simple_spacegroup": simple_spacegroup(
                        pdata["Space Group Symbol"]
                    ),
                    "spacegroup": clean_text(pdata["Space Group Symbol"]),
                    "spacegroup_number": pdata["Space Group Number"],
                    "description": description,
                    "hyperlink": (
                        f"http://www.aflowlib.org/CrystalDatabase/{prototype}.html"
                    ),
                    "cell": cell_parameters,
                    "sites": site_data,
                    "n_sites": n_sites,
                }

    with open(path, "w") as fd:
        json.dump(jdata, fd, indent=4)
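
    # Typical use, assuming index.json has already been prepared as described in the
    # module docstring:
    #
    #     python parse.py
    #
    # run from the directory containing index.json. This downloads any missing CIF
    # files into CIF/, writes CIF/references.bib, and regenerates prototypes.json.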