# Source code for pyrad.lbl.hitran.isotopologues

from collections import namedtuple
from html.parser import HTMLParser
from urllib.request import urlopen

from numpy import float32


# Record describing one isotopologue; fields are filled in by _Parser below.
Isotopologue = namedtuple("Isotopologue", ["abundance", "id", "mass"])


class _Parser(HTMLParser):
    """Parses the isotopologue properties for a molecule from the HITRAN website.

    The parser expects each molecule to be introduced by an ``<h4>`` heading whose
    text ends in ``": <formula>"``, followed by a table body whose data rows carry
    the global id in column 1, the abundance in column 5 and the mass in column 6
    (columns are counted from 1 by ``<td>`` tags).

    Attributes:
        abundance: The abundance of the current isotopologue.
        column: Index of the current column in the HTML table.
        formula: Chemical formula of the current molecule.
        id: HITRAN global id for the isotopologue.
        inside_formula: Flag telling if a molecule is currently being processed.
        inside_row: Flag telling if the parser is currently inside a table row.
        iso_list: List of Isotopologue namedtuples for the current molecule.
        isotopologues: Dictionary mapping a molecule to its list of isotopologue namedtuples.
        mass: Mass of the current isotopologue.
    """

    # Text that separates mantissa and exponent in an abundance cell once the
    # exponent's <sup> markup is dropped: NBSP, multiplication sign, NBSP, "10".
    _TIMES_TEN = "\xa0\xd7\xa010"

    def __init__(self):
        super().__init__()
        self.abundance = ""
        self.column = 0
        self.formula = ""
        self.id = ""
        self.inside_formula = False
        self.inside_row = False
        self.iso_list = []
        self.isotopologues = {}
        self.mass = ""

    def handle_data(self, data):
        """Stores data based on the parser's current state.

        Text is accumulated (not assigned) because a single cell can be delivered
        in several pieces, e.g. when it contains nested tags such as ``<sup>``.

        Args:
            data: The data contained in the current HTML tag.
        """
        text = data.strip()
        if self.inside_formula:
            self.formula += text
        elif self.inside_row:
            if self.column == 1:
                self.id += text
            elif self.column == 5:
                self.abundance += text
            elif self.column == 6:
                self.mass += text

    def handle_endtag(self, tag):
        """Resets flags and stores data based on the parser's current state.

        Args:
            tag: Current HTML tag.

        Raises:
            ValueError: If a finished data row holds an id, abundance or mass
                that cannot be converted to a number.
        """
        if tag == "h4" and self.inside_formula:
            self.inside_formula = False
            # Keep only what follows the last colon of the heading.
            self.formula = self.formula.split(":")[-1].strip()
        elif tag == "tr" and self.inside_row:
            # Rows without any <td> (e.g. header rows) carry no isotopologue.
            if self.column > 0:
                self._store_row()
            self.inside_row = False
        elif tag == "tbody":
            self.isotopologues[self.formula] = self.iso_list
            self.formula = ""
            self.iso_list = []

    def handle_starttag(self, tag, attrs):
        """Sets flags based on the parser's current state.

        Args:
            tag: Current HTML tag.
            attrs: List of tuples containing HTML tag attributes.
        """
        if tag == "h4":
            self.inside_formula = True
        elif tag == "tr":
            self.column = 0
            self.inside_row = True
        elif tag == "td" and self.inside_row:
            self.column += 1

    def _store_row(self):
        """Converts the accumulated cell text to an Isotopologue and clears it."""
        # "m\xa0x\xa010" followed by the exponent becomes "m" + "e" + exponent,
        # which float32 can parse; plain decimal text passes through untouched.
        abundance = self.abundance.replace(self._TIMES_TEN, "e")
        record = Isotopologue(id=int(self.id), abundance=float32(abundance),
                              mass=float32(self.mass))
        self.iso_list.append(record)
        self.abundance = ""
        self.id = ""
        self.mass = ""


def isotopologues(url):
    """Downloads a web page of isotopologue tables and parses it.

    Args:
        url: Address of the page to download; it is decoded as UTF-8.

    Returns:
        Dictionary mapping each molecule's chemical formula to its list of
        Isotopologue namedtuples.
    """
    parser = _Parser()
    # Close the response deterministically instead of leaving it to the
    # garbage collector.
    with urlopen(url) as response:
        page = response.read().decode("utf-8")
    parser.feed(page)
    # Force processing of any text the parser is still buffering.
    parser.close()
    return parser.isotopologues