From a6e4182aee201318738f2caab2ffc45f7db93f3a Mon Sep 17 00:00:00 2001 From: mmpcn Date: Tue, 17 Sep 2024 15:55:04 +0200 Subject: [PATCH 1/5] Added description of species table. --- LineTAP.tex | 49 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/LineTAP.tex b/LineTAP.tex index edb0c92..6e7cbca 100644 --- a/LineTAP.tex +++ b/LineTAP.tex @@ -102,16 +102,21 @@ \section{Introduction} data through a VO service employing such a simplified relational mapping. The resulting table schema is presented in section~\ref{sect:quantities}, while the mapping between our columns and the -VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. +VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. As the +InChi number for referencing molecules can become complicated for use, +a second table, the \textit{species table}, ist generated from the LineTap table +containing the species InChiNumber, name and formula. So this table can be searched +before querying LineTap to find the corresponding InChi of a certain molecule. In +\ref{sect:speciestable} a description of this table and it's use can be found. When accessed using the Table Access Protocol TAP -\citep{2019ivoa.spec.0927D}, the table can be queried using the +\citep{2019ivoa.spec.0927D}, the tables can be queried using the expressive SQL-derived query language ADQL, while query results are available in the VOTable format, easily readable by VO client applications. Line databases accessible in this way can be registered in the VO Registry. The detailed rules for this registration, and recommendations for how to discover LineTAP services, are given in -section~\ref{sect:regmatters}. +section~\ref{sect:regmatters}. 
\subsection{Role within the VO Architecture} @@ -379,6 +384,44 @@ \section{Spectral Line Data}\label{sect:quantities} \end{itemize} +\section{Species Table} +\label{ref:speciestable} + +The Species Table, listed in Table~\ref{tab:spcols}, is used to facilitate the +referencing of molecules. As formulas and names can be ambiguous, LineTAP +uses InChi/InChIkey so they are uniquely identified. But it's not quite human readable. +The Species Table contains a mapping between common names and formulas and +InChI numbers. It should be generated by the data providers on all the species +contained in the LineTap tables. It is also a list of all species that are provided by +the LineTap service, so a client can search this table first to find out if this service contains a +certain molecule. + +\begin{table}[hpt] +\hskip -0.05\linewidth +\begin{tabular}{p{0.43\linewidth}cp{0.5\linewidth}} +\sptablerule +\textbf{Name [Unit]} \ucd{UCD}&\textbf{Type}&\textbf{Description}\\ +\sptablerule +% GENERATED: python3 make-columns-table.py +\texttt{inchikey} \hfil\break\ucd{} & text & \raggedright InChIKey of this species\tabularnewline +\rowsep +\texttt{inchi} \hfil\break\ucd{} & text & \raggedright InChI of this species\tabularnewline +\rowsep +\texttt{name} \hfil\break\ucd{} & text & \raggedright A common name of this species\tabularnewline +\rowsep +\texttt{formula} \hfil\break\ucd{} & text & \raggedright Chemical formula of this species\tabularnewline +\rowsep +\texttt{stoichiometricformula} \hfil\break\ucd{} & text & \raggedright Chemical formula of this species\tabularnewline +\rowsep +\texttt{source\_id} \hfil\break\ucd{} & text & \raggedright VAMDC identifier of the origin of this mapping\tabularnewline +% /GENERATED +\sptablerule +\end{tabular} +\caption{The columns that make up the Species Table. } +\label{tab:ltcols} +\end{table} +The columns inchikey, name and formula are indexed to the columns of same name in the LineTap table. 
+ \section{Protocol} \label{sect:protocol} From 4f6ad60971f52ec2f344335871141b05e54078ef Mon Sep 17 00:00:00 2001 From: mmpcn Date: Tue, 17 Sep 2024 15:55:36 +0200 Subject: [PATCH 2/5] script to generate latex species table description --- make-species-table.py | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 make-species-table.py diff --git a/make-species-table.py b/make-species-table.py new file mode 100644 index 0000000..983aa10 --- /dev/null +++ b/make-species-table.py @@ -0,0 +1,76 @@ +#!/usr/bin/python3 +""" +This writes LaTeX for the rows of our table of LineTAP species table columns. Technically, +this obtains the info from the standard columns of an operational (and +hopefully validated) table at dc.g-vo.org. + +Dependency: python3-pyvo (and hence astropy). +""" + +import pyvo + +NON_NULL_COLUMNS = {'title', 'vacuum_wavelength'} +TYPE_MAP = { + ("char", "*"): "text", + ("unicodeChar", "*"): "text", + ("int", ""): "integer", + ("double", ""): "float",} + + +def e(tx): + """returns tx with TeX's standard active (and other magic) characters + escaped. + """ + return tx.replace("\\", "$\\backslash$" + ).replace("&", "\\&" + ).replace("#", "\\#" + ).replace("%", "\\%" + ).replace("_", "\\_" + ).replace("}", "\\}" + ).replace("{", "\\{" + ).replace('"', '{"}') + + +def get_type(datatype, arraysize, nonnull): + """returns a simple type identifier for a VOTable datatype/arraysize. + + Well, this really only knows what people have manually entered into + TYPE_MAP above...
+ """ + res = e(TYPE_MAP[datatype, arraysize]) + if nonnull: + res = f"\\textbf{{{res}}}" + return res + + +def main(): + svc = pyvo.tap.TAPService("http://dc.g-vo.org/tap") + rows = [] + + for row in svc.run_sync(""" + select column_name, description, unit, ucd, datatype, arraysize + from tap_schema.columns + where + table_name='species.main' + order by column_index"""): + parts = [r"\texttt{{{}}}".format(e(row["column_name"]))] + if row["unit"]: + parts.append(e("["+row["unit"].replace("Angstrom", "Å")+"]")) + parts.append(r"\hfil\break\ucd{{{}}}".format(e(row["ucd"]))) + + parts.append("&") + parts.append(get_type( + row["datatype"], + row["arraysize"], + row["column_name"] in NON_NULL_COLUMNS)) + + parts.append("&") + parts.append(r"\raggedright "+e(row["description"])) + + rows.append(" ".join(parts)+r"\tabularnewline") + + print("\n\\rowsep\n".join(rows)) + + +if __name__=="__main__": + main() From 724170f1484cec1f2c2339e4f97f26eadff9fa87 Mon Sep 17 00:00:00 2001 From: mmpcn Date: Wed, 18 Sep 2024 14:11:13 +0200 Subject: [PATCH 3/5] added species table --- LineTAP.tex | 15 +++++++++------ Makefile | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/LineTAP.tex b/LineTAP.tex index 6e7cbca..1fa0b19 100644 --- a/LineTAP.tex +++ b/LineTAP.tex @@ -102,12 +102,13 @@ \section{Introduction} data through a VO service employing such a simplified relational mapping. The resulting table schema is presented in section~\ref{sect:quantities}, while the mapping between our columns and the -VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. As the -InChi number for referencing molecules can become complicated for use, +VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. + +As the InChi number for referencing molecules can become complicated for use, a second table, the \textit{species table}, ist generated from the LineTap table -containing the species InChiNumber, name and formula. 
So this table can be searched +containing the species InChiNumber, name and formula. This table can be searched before querying LineTap to find the corresponding InChi of a certain molecule. In -\ref{sect:speciestable} a description of this table and it's use can be found. +Section \ref{sect:speciestable} a description of this table and it's use can be found. When accessed using the Table Access Protocol TAP \citep{2019ivoa.spec.0927D}, the tables can be queried using the @@ -240,6 +241,7 @@ \subsection{Non-Use Cases} \end{itemize} + \begin{table}[hpt] \hskip -0.05\linewidth \begin{tabular}{p{0.43\linewidth}cp{0.5\linewidth}} @@ -384,7 +386,7 @@ \section{Spectral Line Data}\label{sect:quantities} \end{itemize} -\section{Species Table} +\section{Species Table}\label{sect:speciestable} \label{ref:speciestable} The Species Table, listed in Table~\ref{tab:spcols}, is used to facilitate the @@ -418,7 +420,7 @@ \section{Species Table} \sptablerule \end{tabular} \caption{The columns that make up the Species Table. } -\label{tab:ltcols} +\label{tab:spcols} \end{table} The columns inchikey, name and formula are indexed to the columns of same name in the LineTap table. @@ -471,6 +473,7 @@ \subsection{Use Case Examples} In this section, we give queries addressing the use cases from section~\ref{sect:use-cases}. +\todo{add example using species table}. \subsubsection{Identifying a Single Line} diff --git a/Makefile b/Makefile index 6cee957..d7b63cb 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ DOCNAME = LineTAP DOCVERSION = 1.0 # Publication date, ISO format; update manually for "releases" -DOCDATE = 2023-03-23 +DOCDATE = 2024-09-18 # What is it you're writing: NOTE, WD, PR, REC, PEN, or EN DOCTYPE = WD From 0436afe6e90e6d4ce5cfda0e75b099794b4e90a3 Mon Sep 17 00:00:00 2001 From: Markus Demleitner Date: Thu, 19 Sep 2024 16:17:36 +0200 Subject: [PATCH 4/5] Review of species list addition. Important: this changes the utype of the lines tables. 
This also adds registry considerations and a use case. --- LineTAP.tex | 115 +++++++++++++++++++++++++++++++++------------------- linetap.vor | 24 +++++++---- 2 files changed, 89 insertions(+), 50 deletions(-) diff --git a/LineTAP.tex b/LineTAP.tex index 1fa0b19..d641a5a 100644 --- a/LineTAP.tex +++ b/LineTAP.tex @@ -102,13 +102,15 @@ \section{Introduction} data through a VO service employing such a simplified relational mapping. The resulting table schema is presented in section~\ref{sect:quantities}, while the mapping between our columns and the -VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. +VAMDC-XSAMS Data Model is given in section~\ref{sect:mapping}. -As the InChi number for referencing molecules can become complicated for use, -a second table, the \textit{species table}, ist generated from the LineTap table -containing the species InChiNumber, name and formula. This table can be searched -before querying LineTap to find the corresponding InChi of a certain molecule. In -Section \ref{sect:speciestable} a description of this table and it's use can be found. +During the development of the standard, a major problem in molecular +spectroscopy turned out to be species nomenclature. The core LineTAP +table sidesteps this problem by identifying species using IUPAC standard +InChIs, a choice unpopular with many practitioners. To facilitate the +use of colloquial species designations (``ethyl alcohol''), this +specification also defines a \textit{species table} associating common +names and sum formulas with InChIs in section \ref{sect:speciestable}. When accessed using the Table Access Protocol TAP \citep{2019ivoa.spec.0927D}, the tables can be queried using the @@ -117,7 +119,7 @@ \section{Introduction} applications. Line databases accessible in this way can be registered in the VO Registry. The detailed rules for this registration, and recommendations for how to discover LineTAP services, are given in -section~\ref{sect:regmatters}. 
+section~\ref{sect:regmatters}. \subsection{Role within the VO Architecture} @@ -226,6 +228,13 @@ \subsection{Credit} repository of line data, it should be as simple as possible for users to give credit to the contributors of line data. +\subsection{Resolution of Molecule Designation} +\label{uc:resolution} + +A researcher wants to find lines for the molecule they have been calling +``Methyl Mercaptan'' or designated by a pseudo-structural formula like +\verb|CH3SHv=0| for a long time. + \subsection{Non-Use Cases} @@ -389,14 +398,21 @@ \section{Spectral Line Data}\label{sect:quantities} \section{Species Table}\label{sect:speciestable} \label{ref:speciestable} -The Species Table, listed in Table~\ref{tab:spcols}, is used to facilitate the -referencing of molecules. As formulas and names can be ambiguous, LineTAP -uses InChi/InChIkey so they are uniquely identified. But it's not quite human readable. -The Species Table contains a mapping between common names and formulas and -InChI numbers. It should be generated by the data providers on all the species -contained in the LineTap tables. It is also a list of all species that are provided by -the LineTap service, so a client can search this table first to find out if this service contains a -certain molecule. +The species table is used to facilitate the referencing of molecules. As +there are many summary formulas and colloquial molecule names for common +species (and more than one species may correspond to a given summary +formula and even colloquial name), the resolution of such identifiers to +InChIs is generally non-trivial. + +LineTAP's species table contains a mapping between common names and +summary formulas and InChIs. It should be populated by data providers +publishing molecule data to the best of their knowledge. It is +explicitly possible to associate multiple names with a single InChI. 
+There is no explicit relationship between a species table and LineTAP +tables on a given service, i.e., the presence of a species in the the +species table is not a guarantee that data on it is available from any +table in the service.\todo{Is there a use case for having InChI in here? +I'd say InChIKey is good enough.} \begin{table}[hpt] \hskip -0.05\linewidth @@ -404,25 +420,23 @@ \section{Species Table}\label{sect:speciestable} \sptablerule \textbf{Name [Unit]} \ucd{UCD}&\textbf{Type}&\textbf{Description}\\ \sptablerule -% GENERATED: python3 make-columns-table.py +% GENERATED: python3 make-species-table.py \texttt{inchikey} \hfil\break\ucd{} & text & \raggedright InChIKey of this species\tabularnewline \rowsep \texttt{inchi} \hfil\break\ucd{} & text & \raggedright InChI of this species\tabularnewline \rowsep \texttt{name} \hfil\break\ucd{} & text & \raggedright A common name of this species\tabularnewline \rowsep -\texttt{formula} \hfil\break\ucd{} & text & \raggedright Chemical formula of this species\tabularnewline -\rowsep -\texttt{stoichiometricformula} \hfil\break\ucd{} & text & \raggedright Chemical formula of this species\tabularnewline +\texttt{formula} \hfil\break\ucd{} & text & \raggedright Chemical formula of this species in some free-ish notation\tabularnewline \rowsep \texttt{source\_id} \hfil\break\ucd{} & text & \raggedright VAMDC identifier of the origin of this mapping\tabularnewline + % /GENERATED \sptablerule \end{tabular} \caption{The columns that make up the Species Table. } \label{tab:spcols} \end{table} -The columns inchikey, name and formula are indexed to the columns of same name in the LineTap table. \section{Protocol} @@ -473,7 +487,6 @@ \subsection{Use Case Examples} In this section, we give queries addressing the use cases from section~\ref{sect:use-cases}. -\todo{add example using species table}. 
\subsubsection{Identifying a Single Line} @@ -587,6 +600,24 @@ \subsubsection{Characterising a Service's Data Holdings} GROUP BY inchi \end{lstlisting} +\subsubsection{Searching With Trivial Molecule Names} + +Searching with trivial names as discussed in use +case~\ref{uc:resolution} would often be a two-step process where clients +ask the researcher which InChI would correspond to the species they +were looking for. In simple cases, however, a single joined query can be +run, too. + +% please-run-a-test +\begin{lstlisting}[language=SQL] +SELECT + * +FROM casa_lines.line_tap +JOIN species.main as s USING (inchikey) +WHERE s.name='Methylidynium' +\end{lstlisting} + + \section{Mapping from VAMDCXSAMS} \label{sect:mapping} @@ -711,16 +742,13 @@ \section{LineTAP and the VO Registry} \subsection{Registering LineTAP-conforming Tables} -LineTAP tables are registered using VODataService \citep{2021ivoa.spec.1102D} +LineTAP line tables are registered using VODataService \citep{2021ivoa.spec.1102D} tablesets, where the table utype is set to -$$\hbox{\verb|ivo://ivoa.net/std/linetap#table-1.0|}.$$ +$$\hbox{\verb|ivo://ivoa.net/std/linetap#lines-1.0|}.$$ -The tableset is normally contained in a VODataService \xmlel{CatalogService} -record with a TAP capability, and this capability normally is an auxiliary -capability as per DDC \citep{2019ivoa.spec.0520D}. For one-table -services a full TAPRegExt \citep{2012ivoa.spec.0827D} capability is also -allowed; other resource types can be used for registration as -appropriate. +The tableset is contained in a VODataService \xmlel{CatalogResource} +record with a TAP auxiliary capability +as per DDC \citep{2019ivoa.spec.0520D}. Further capabilities, for instance for full VAMDC or legacy SLAP services, may be given in the same record. @@ -760,7 +788,7 @@ \subsection{Registering LineTAP-conforming Tables} toss.ivoa_lines TOSS The LineTAP version of... - ivo://ivoa.net/std/linetap#table-1.0 + ivo://ivoa.net/std/linetap#lines-1.0 ...
\end{lstlisting} @@ -772,6 +800,12 @@ \subsection{Registering LineTAP-conforming Tables} and is thus to be expected in most registrations of this type. Clients are advised to use the resource description for full text searches. +Species tables are registered in exactly the same way, except their +utype is +$$\hbox{\verb|ivo://ivoa.net/std/linetap#species-1.0|}.$$ +Data providers should only register line and species tables in one +resource record if the species table really has the same metadata +(description, author, source, etc) as the line table. \subsection{Discovering LineTAP services} @@ -784,20 +818,17 @@ \subsection{Discovering LineTAP services} would return TAP access URLs and the table names: \begin{lstlisting}[language=SQL] -SELECT DISTINCT table_name, access_url +SELECT table_name, access_url FROM rr.res_table NATURAL JOIN rr.capability NATURAL JOIN rr.interface WHERE - table_utype LIKE 'ivo://ivoa.net/std/linetap#table-1.%' + table_utype LIKE 'ivo://ivoa.net/std/linetap#lines-1.%' AND standard_id LIKE 'ivo://ivoa.net/std/tap%' AND intf_role='std' + AND res_type='vs:catalogresource' \end{lstlisting} -The \texttt{DISTINCT} in the main query is a rough filter that removes -entries duplicated because their tables are registred both in the main -TAP record and in an auxiliary capability. - The regular expression in the utype match is to make sure minor version increments do not prevent service discovery; by IVOA versioning rules, all LineTAP services of minor version 1 can be operated by all LineTAP @@ -805,14 +836,16 @@ \subsection{Discovering LineTAP services} service. Clients may want to adapt the TAP discovery pattern to match their specific needs. - +Adapting the utype, this query will work analogously for species tables. \appendix -\section{Changes from Previous Versions} +\section{Changes from WD-2023-03-23} -No previous versions yet. -% these would be subsections "Changes from v. WD-..." -% Use itemize environments. 
+\begin{itemize} +\item Adding the species table +\item Changing the line table utype to \dots lines-1.0 (rather than +\dots table-1.0 before). +\end{itemize} \bibliography{ivoatex/ivoabib,ivoatex/docrepo, localrefs} diff --git a/linetap.vor b/linetap.vor index f31a5dc..43dd39d 100644 --- a/linetap.vor +++ b/linetap.vor @@ -1,11 +1,11 @@ -1.0 - table-1.0 - The LineTAP table schema as of version 1.0. + lines-1.0 + The LineTAP lines table schema as of version 1.0. + + + + + species-1.0 + The LineTAP species table schema as of version 1.0. From 21fcbf7f1663860d314b5ae3eb7de650ba82d5fd Mon Sep 17 00:00:00 2001 From: mmpcn Date: Wed, 25 Sep 2024 11:31:01 +0200 Subject: [PATCH 5/5] changed section names and added information --- LineTAP.tex | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/LineTAP.tex b/LineTAP.tex index d641a5a..547f9d9 100644 --- a/LineTAP.tex +++ b/LineTAP.tex @@ -296,7 +296,7 @@ \subsection{Non-Use Cases} \end{table} -\section{Spectral Line Data}\label{sect:quantities} +\section{Spectral Lines Table}\label{sect:quantities} Table~\ref{tab:ltcols} gives the columns that make up the LineTAP relational model. Implementations MUST have all columns given in this @@ -411,8 +411,11 @@ \section{Species Table}\label{sect:speciestable} There is no explicit relationship between a species table and LineTAP tables on a given service, i.e., the presence of a species in the the species table is not a guarantee that data on it is available from any -table in the service.\todo{Is there a use case for having InChI in here? -I'd say InChIKey is good enough.} +table in the service. + +In most cases, the InChIKey alone is enough to reference a molecule. The InChI +column is present in this table so that users can confirm that the +returned molecule is the one they are searching for.
\begin{table}[hpt] \hskip -0.05\linewidth @@ -438,12 +441,7 @@ \section{Species Table}\label{sect:speciestable} \label{tab:spcols} \end{table} - -\section{Protocol} -\label{sect:protocol} -\subsection{Queries: LineTAP} - -\subsection{User-defined functions} +\section{ADQL User-defined functions} \label{sect:udfs} LineTAP services MUST implement the \texttt{ivo\_specconv} user defined