Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CDMS: add whole-molecule query tool & refactor metadata acquisition #3173

Merged
merged 1 commit into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ New Tools and Services
Service fixes and enhancements
------------------------------

linelists.cdms
^^^^^^^^^^^^^^

- Add whole catalog retrieval, improve error messaging for unparseable lines,
improve metadata catalog, and improve lookuptable behavior [#3173,#2901]

jplspec
^^^^^^^

- minor improvement to lookuptable behavior [#3173,#2901]


Infrastructure, Utility and Other Changes and Additions
-------------------------------------------------------
Expand Down
17 changes: 10 additions & 7 deletions astroquery/jplspec/lookup_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

class Lookuptable(dict):

def find(self, s, flags):
def find(self, st, flags):
keflavich marked this conversation as resolved.
Show resolved Hide resolved
"""
Search dictionary keys for a regex match to string s

Parameters
----------
s : str
st : str
String to compile as a regular expression
Can be entered non-specific for broader results
('H2O' yields 'H2O' but will also yield 'HCCCH2OD')
Expand All @@ -22,17 +22,20 @@ def find(self, s, flags):

Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex

"""

R = re.compile(s, flags)
if st in self:
return {st: self[st]}

R = re.compile(st, flags)

out = {}

for k, v in self.items():
match = R.search(str(k))
for key, val in self.items():
match = R.search(str(key))
if match:
out[k] = v
out[key] = val

return out
12 changes: 12 additions & 0 deletions astroquery/linelists/cdms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,25 @@ class Conf(_config.ConfigNamespace):
Configuration parameters for `astroquery.linelists.cdms`.
"""
server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/',
'CDMS Search and Conversion Form URL.')

search = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/cgi-bin/cdmssearch',
'CDMS Search and Conversion Form URL.')

catfile_url = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
'CDMS partition function table listing all available molecules.')

catfile_url2 = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
'CDMS catalog table listing all available molecules (with different names from partition function).')

classic_server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic',
'CDMS Classic Molecule List server.')

timeout = _config.ConfigItem(
60,
'Time limit for connecting to the CDMS server.')
Expand Down
222 changes: 208 additions & 14 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from bs4 import BeautifulSoup
import astropy.units as u
from astropy import table
from astropy.io import ascii
from astroquery.query import BaseQuery
from astroquery.utils import async_to_sync
Expand All @@ -26,8 +27,11 @@ def data_path(filename):
@async_to_sync
class CDMSClass(BaseQuery):
# use the Configuration Items imported from __init__.py
URL = conf.server
URL = conf.search
SERVER = conf.server
CLASSIC_URL = conf.classic_server
TIMEOUT = conf.timeout
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']

def query_lines_async(self, min_frequency, max_frequency, *,
min_strength=-500, molecule='All',
Expand Down Expand Up @@ -143,8 +147,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
else:
payload['Molecules'] = molecule

payload = list(payload.items())

if get_query_payload:
return payload
# BaseQuery classes come with a _request method that includes a
Expand All @@ -170,6 +172,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
response2 = self._request(method='GET', url=fullurl,
timeout=self.TIMEOUT, cache=cache)

# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
if payload['Molecules'] in badlist:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say drop line 146, as request can deal with a dict data input, but you cannot index it if you turn in to a list of tuples. A dictionary would be anyways better as it would be consistent with our docstrings (even though upstream accepts more types right now).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I did this, I hope it worked.... I ran into more local configuration issues w/pytest that affect only astroquery. grumble.

raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
f"Try get_molecule({payload['Molecules']}) instead.")

return response2

def _parse_result(self, response, *, verbose=False):
Expand Down Expand Up @@ -278,8 +287,9 @@ def _parse_result(self, response, *, verbose=False):

return result

def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
catfile_url=conf.catfile_url):
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
catfile_url=conf.catfile_url,
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
"""
A directory of the catalog is found in a file called 'catdir.cat.'

Expand All @@ -302,9 +312,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
"""

if use_cached:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
try:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
except UnicodeDecodeError:
with open(data_path(catfile), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result = ascii.read(text, format='basic', delimiter='|')
with open(data_path(catfile2), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result2 = ascii.read(text, format='basic', delimiter='|')
else:
result = retrieve_catfile(catfile_url)
result2 = retrieve_catfile2(catfile_url2)
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)

merged = table.join(result, result2, keys=['tag'])
if not all(merged['#lines'] == merged['# lines']):
raise ValueError("Inconsistent table of molecules from CDMS.")
del merged['# lines']

# reorder columns
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))',
'Ver.', 'Documentation', 'Date of entry', 'Entry']]

meta = {'lg(Q(1000))': 1000.0,
'lg(Q(500))': 500.0,
Expand All @@ -331,6 +367,96 @@ def tryfloat(x):
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
37.5, 18.75, 9.375, 5., 2.725]}

result.add_index('tag')

return result

def get_molecule(self, molecule_id, *, cache=True):
"""
Retrieve the whole molecule table for a given molecule id
"""
if not isinstance(molecule_id, str) or len(molecule_id) != 6:
raise ValueError("molecule_id should be a length-6 string of numbers")
url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
response = self._request(method='GET', url=url,
timeout=self.TIMEOUT, cache=cache)
result = self._parse_cat(response)

species_table = self.get_species_table()
result.meta = dict(species_table.loc[int(molecule_id)])

return result

def _parse_cat(self, response, *, verbose=False):
"""
Parse a catalog response into an `~astropy.table.Table`

See details in _parse_response; this is a very similar function,
but the catalog responses have a slightly different format.
"""

if 'Zero lines were found' in response.text:
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")

text = response.text

# notes about the format
# [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
# 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
starts = {'FREQ': 0,
'ERR': 14,
'LGINT': 22,
'DR': 30,
'ELO': 32,
'GUP': 42,
'TAG': 45,
'QNFMT': 52,
'Q1': 56,
'Q2': 58,
'Q3': 60,
'Q4': 62,
'Q5': 64,
'Q6': 66,
'Q7': 68,
'Q8': 70,
'Q9': 72,
'Q10': 74,
'Q11': 76,
'Q12': 78,
'Q13': 80,
'Q14': 82,
}

result = ascii.read(text, header_start=None, data_start=0,
comment=r'THIS|^\s{12,14}\d{4,6}.*',
names=list(starts.keys()),
col_starts=list(starts.values()),
format='fixed_width', fast_reader=False)

# int truncates - which is what we want
result['MOLWT'] = [int(x/1e4) for x in result['TAG']]

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

fix_keys = ['GUP']
for suf in '':
for qn in (f'Q{ii}' for ii in range(1, 15)):
qnind = qn+suf
fix_keys.append(qnind)
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),
dtype=int)
result[key] = intcol

result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)

return result


Expand Down Expand Up @@ -375,10 +501,13 @@ def find(self, st, flags):

Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex

"""

if st in self:
return {st: self[st]}

out = {}

for kk, vv in self.items():
Expand All @@ -394,24 +523,89 @@ def find(self, st, flags):
def build_lookup():

result = CDMS.get_species_table()

# start with the 'molecule' column
keys = list(result['molecule'][:]) # convert NAME column to list
values = list(result['tag'][:]) # convert TAG column to list
dictionary = dict(zip(keys, values)) # make k,v dictionary

# repeat with the Name column
keys = list(result['Name'][:])
values = list(result['tag'][:])
dictionary2 = dict(zip(keys, values))
dictionary.update(dictionary2)

lookuptable = Lookuptable(dictionary) # apply the class above

return lookuptable


def retrieve_catfile(url='https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html'):
def retrieve_catfile(url=f'{conf.classic_server}/entries/partition_function.html'):
"""
Simple retrieve index function
"""
response = requests.get(url)
response.raise_for_status()
tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))'],
col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
format='fixed_width', delimiter=' ')
lines = response.text.split("\n")

# used to convert '---' to nan
def tryfloat(x):
try:
return float(x)
except ValueError:
return np.nan

# the 'fixed width' table reader fails because there are rows that violate fixed width
tbl_rows = []
for row in lines[15:-5]:
split = row.split()
tag = int(split[0])
molecule_and_lines = row[7:41]
molecule = " ".join(molecule_and_lines.split()[:-1])
nlines = int(molecule_and_lines.split()[-1])
partfunc = map(tryfloat, row[41:].split())
partfunc_dict = dict(zip(['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))'], partfunc))
tbl_rows.append({'tag': tag,
'molecule': molecule,
'#lines': nlines,
})
tbl_rows[-1].update(partfunc_dict)
tbl = table.Table(tbl_rows)
# tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
# names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
# 'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
# 'lg(Q(2.725))'],
# col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
# format='fixed_width', delimiter=' ')
return tbl


def retrieve_catfile2(url=f'{conf.classic_server}/predictions/catalog/catdir.html'):
"""
Simple retrieve index function
"""
response = requests.get(url)
response.raise_for_status()
try:
tbl = ascii.read(response.text, format='html')
except UnicodeDecodeError:
# based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
keflavich marked this conversation as resolved.
Show resolved Hide resolved
# which suggests to start with the bytecode content and decode with 'replace errors'
text = response.content.decode('ascii', errors='replace')
tbl = ascii.read(text, format='html')

# delete a junk column (wastes space)
del tbl['Catalog']

# for joining - want same capitalization
tbl.rename_column("Tag", "tag")

# one of these is a unicode dash, the other is a normal dash.... in theory
if 'Entry in cm–1' in tbl.colnames:
tbl.rename_column('Entry in cm–1', 'Entry')
if 'Entry in cm-1' in tbl.colnames:
tbl.rename_column('Entry in cm-1', 'Entry')

return tbl
Loading
Loading