# Source code for pyrad.lbl.hitran.molecules

from html.parser import HTMLParser
from urllib.request import urlopen


class _Parser(HTMLParser):
    """Parses HITRAN molecule ids from the HITRAN website.

    Only rows of a ``<table class="list-table">`` are examined.  Within a row,
    the text of the first ``<td>`` cell is taken as the HITRAN id and the text
    of the second ``<td>`` cell as the chemical formula; text from nested tags
    (e.g. ``<sub>``) is concatenated, so ``H<sub>2</sub>O`` becomes ``"H2O"``.

    Attributes:
        column: Current column in the html table (number of ``<td>`` tags seen
            in the current row; 0 outside of a row).
        formula: Chemical formula of the current molecule.
        id: HITRAN integer identifier of the current molecule (kept as text
            until the row is finished).
        inside_row: Flag telling if the parser is inside a row of a HTML table.
        inside_table: Flag telling if the parser is inside the correct HTML table.
        molecule_ids: Dictionary mapping chemical formulae to their HITRAN ids.
    """

    def __init__(self):
        super().__init__()
        self.column = 0
        self.formula = ""
        self.id = ""
        self.inside_row = False
        self.inside_table = False
        self.molecule_ids = {}

    def _finish_row(self):
        """Stores the pending row (if it is complete) and resets the row state.

        A row is stored only if it contained ``<td>`` cells and both the id and
        the formula cells had text; header rows and empty spacer rows are
        skipped.

        Raises:
            ValueError: if the id cell holds text that is not an integer.
        """
        if self.inside_row and self.column > 0 and self.id and self.formula:
            self.molecule_ids[self.formula] = int(self.id)
        # Always reset, even for rows that were skipped, so that no state can
        # leak into the next row or into markup that follows the table.
        self.column = 0
        self.formula = ""
        self.id = ""
        self.inside_row = False

    def handle_data(self, data):
        """Stores the current HITRAN id or chemical symbol, based on the parser's current state.

        Args:
            data: The data contained in the current HTML tag.
        """
        if not self.inside_row:
            return
        text = data.strip()
        if self.column == 1:
            self.id += text
        elif self.column == 2:
            self.formula += text

    def handle_endtag(self, tag):
        """Resets flags and stores data based on the parser's current state.

        Args:
           tag: Current HTML tag.
        """
        if tag == "tr":
            self._finish_row()
        elif tag == "table" and self.inside_table:
            # The closing </tr> is optional in HTML, so a row may still be
            # pending when the table ends.
            self._finish_row()
            self.inside_table = False

    def handle_starttag(self, tag, attrs):
        """Sets flags based on the parser's current state.

        Args:
            tag: Current HTML tag.
            attrs: List of tuples containing HTML tag attributes.
        """
        if tag == "table" and ("class", "list-table") in attrs:
            self.inside_table = True
        elif tag == "tr" and self.inside_table:
            # A new row implicitly ends the previous one when its </tr> was
            # omitted; after a properly closed row this is a no-op.
            self._finish_row()
            self.inside_row = True
        elif tag == "td" and self.inside_row:
            self.column += 1

def molecules(url):
    """Creates a dictionary mapping molecular chemical formulae to HITRAN ids.

    Downloads the page at ``url``, decodes it as UTF-8 and feeds it to a
    ``_Parser``.

    Args:
        url: URL to HITRAN molecules table.

    Returns:
        A dictionary mapping molecular chemical formulae to HITRAN ids.
    """
    parser = _Parser()
    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urlopen(url) as response:
        page = response.read().decode("utf-8")
    parser.feed(page)
    # Flush any text the parser is still buffering at end of input.
    parser.close()
    return parser.molecule_ids