methods.tex

% -*- mode: latex; mode: visual-line; fill-column: 9999; coding: utf-8 -*-

\section{Methods}
\label{sec:methods}


In the following we define the quantities and approach used for our performance measurements, with a full summary of all definitions in Table~\ref{tab:notation}.
We evaluated MPI performance of the parallel RMSD time series algorithm~\ref{alg:RMSD} and its variation (algorithm \ref{alg:RMSDhdf5}) by timing the total time to solution as well as the execution time for different parts of the code for individual MPI ranks with the help of the Python \texttt{time.time()} function.


\begin{table}[!htb]
  \centering
  \caption[Summary of measured timing quantities.]  {Summary of measured timing quantities.  Timings are collected for the specified line numbers in the code, labeled as $t_{\text{L$n$}}$ where $\text{L$n$}$ refers to the line number in the corresponding algorithm (columns Algorithm~\ref{alg:RMSD} and \ref{alg:RMSDhdf5}), or are calculated in the same way for both algorithms from the specific quantities.  Variables in the top part of the table refer to measurements of an individual MPI rank.  Variables in the bottom part are aggregates such as averages over all ranks or the total time to solution.}
  \label{tab:notation}
\begin{threeparttable}
  \begin{tabular}{cccp{0.35\textwidth}}
    \toprule
    \bfseries\thead{Quantity} & \multicolumn{2}{c}{\bfseries\thead{Definition}} & \bfseries\thead{Description}\\
                              & Algorithm~\ref{alg:RMSD} & Algorithm~\ref{alg:RMSDhdf5} & \\
    \midrule  
    $t_{\text{opening\_trajectory}}$ &  $t_{\text{L2}}+t_{\text{L3}}$ & ---\textsuperscript{a} & file opening and data structure initialization\\
    $\tIO^{\text{frame}}$   & $t_{\text{L4}}$ & $t_{\text{L2}}$ & data reading per frame\\  
    $\tcomp^{\text{frame}}$ & $t_{\text{L5}}$ & $t_{\text{L3}}$ & compute per frame\\  
    $t_{\text{all\_frame}}$ & $t_{\text{L4}}+t_{\text{L5}}+t_{\text{L6}}$ & $t_{\text{L2}}+t_{\text{L3}}+t_{\text{L4}}$  & time to analyze one frame\\
    $t_{\text{RMSD}}$ &  $t_{\text{L1}} + ...+ t_{\text{L8}}$ & $t_{\text{L1}} + ...+ t_{\text{L6}}$ & time to execute the body of the \texttt{block\_rmsd()} function\\
    $t_{\text{end\_loop}}$ & $t_{\text{L6}} $  & $t_{\text{L4}} $ & closing of the trajectory at the end of the loop\\
    $\tcomm$  & $t_{\text{L16}}$ &  $t_{\text{L15}}$ & data communication with \texttt{MPI\_Gather()}\\
    \cmidrule(l){2-3}
    $N_{\text{frames}}^{\text{total}}$ & & & total number of trajectory frames\\
    $N$ & & & total number of MPI ranks (processes), equals the number of trajectory blocks\\
    $N_{\text{b}}$ & \multicolumn{2}{c}{$N_{\text{frames}}^{\text{total}}/N$} & number of frames per block\\
    $\tcomp$ & \multicolumn{2}{c}{$\sum_{\text{frame}=1}^{N_{\text{b}}}\tcomp^{\text{frame}}$} & total compute time for a rank (process)\\
    $\tIO$ & \multicolumn{2}{c}{$\sum_{\text{frame}=1}^{N_{\text{b}}}\tIO^{\text{frame}}$} & total read I/O time for a rank (process)\\  
    $t_{\text{Overhead1}}$ & \multicolumn{2}{c}{$t_{\text{all\_frame}}-t_{\text{I/O}}-t_{\text{comp}}-t_{\text{end\_loop}}$}  & time inside \texttt{block\_rmsd()} that was not measured explicitly\\
    $t_{\text{Overhead2}}$ & \multicolumn{2}{c}{$t_{\text{RMSD}}-t_{\text{all\_frame}}-t_{\text{opening\_trajectory}}$} & overhead for the \texttt{block\_rmsd()} function call\\
    $t_{N}$ & \multicolumn{2}{c}{$t_{\text{RMSD}}+\tcomm$} & total time to completion for a rank (process)\\
    \toprule
    $\overline{\tcomp}$ & \multicolumn{2}{c}{$\frac{1}{N}\sum_{\text{rank}=1}^{N} \tcomp$} & average compute time over all ranks\\
    $\overline{\tIO}$ & \multicolumn{2}{c}{$\frac{1}{N}\sum_{\text{rank}=1}^{N} \tIO$} & average read I/O time over all ranks\\
    $\overline{\tcomm}$ & \multicolumn{2}{c}{$\frac{1}{N}\sum_{\text{rank}=1}^{N} \tcomm$} & average communication time over all ranks\\
    $t_{\text{total}}$ & \multicolumn{2}{c}{$\max t_{N}$} & total time to solution\\
    \bottomrule
  \end{tabular}
    \begin{tablenotes}[para,flushleft]
    \item [a] Algorithm~\ref{alg:RMSDhdf5} does not need to open a trajectory inside the \texttt{block\_rmsd()} function and hence $t_{\text{opening\_trajectory}}$ only measures time to allocate empty arrays, which is not explicitly shown in Algorithm~\ref{alg:RMSDhdf5}.
      \end{tablenotes}
\end{threeparttable}
\end{table}


\subsection{Timing Observables}

We abbreviate the timings in the following as variables $t_{\text{L$n$}}$ where $\text{L$n$}$ refers to the line number in algorithm~\ref{alg:RMSD} (or algorithm \ref{alg:RMSDhdf5}, see Table~\ref{tab:notation}).
In the function \texttt{block\_rmsd()}, we measured the \emph{read I/O time} for ingesting the data of one trajectory frame from the file system into memory, $t_{\text{I/O}}^{\text{frame}} = t_{\text{L4}}$, and the \emph{compute time} per trajectory frame to perform the computation, $\tcomp^{\text{frame}} = t_{\text{L5}}$.
The \emph{total read I/O time for a MPI rank},  $\tIO = \sum_{\text{frame}=1}^{N_{\text{b}}} \tIO^{\text{frame}}$, is the sum over all I/O times for all the $N_{\text{frames}}$ frames assigned to the rank; similarly, the \emph{total compute time for a MPI rank} is $\tcomp = \sum_{\text{frame}=1}^{N_{\text{b}}} \tcomp^{\text{frame}}$. 
The time delay between the end of the last iteration and exiting the \texttt{for} loop is $t_{\text{end\_loop}} = t_{\text{L6}}$.
The time $t_{\text{opening\_trajectory}} = t_{\text{L2}}+t_{\text{L3}}$ measures the problem setup, which includes data structure initialization and opening of topology and trajectory files.
The \emph{communication time}, $\tcomm = t_{\text{L16}}$, is the time to gather all data from all processor ranks to rank zero.
The total time (for all frames) spent in \texttt{block\_rmsd()} is $t_{\text{RMSD}} = \sum_{i=1}^{8}t_{\text{L$i$}}$. 
There are parts of the code in \texttt{block\_rmsd()} that are not covered by the detailed timing information of \tcomp and \tIO.
Unaccounted time is considered as \emph{overhead}.
We define $t_{\text{Overhead1}}$ and $t_{\text{Overhead2}}$ as the overheads of the calculations (see Table \ref{tab:notation} for the definitions); both are expected to be negligible, which was the case in all our measurements. 
Finally, the \emph{total time to completion of a single MPI rank}, when utilizing $N$ cores for the execution of the overall experiment, is $t_{N}$, and as a result $t_{\text{RMSD}} + \tcomm \equiv t_{N}$.

\subsection{Performance Parameters}

We measured the \emph{total time to solution} $t_{\text{total}}(N)$ with $N$ MPI processes on $N$ cores, which is effectively
$t_{\text{total}}(N) \approx \max(t_{N})$. 
Strong scaling was quantified by the speed-up
\begin{equation}
  \label{eq:speedup}
  S(N) = \frac{t_{\text{total}}(1)}{t_{\text{total}}(N)},
\end{equation}
relative to performance on a single core ($t_{\text{total}}(1)$), and the efficiency
\begin{equation}
  \label{eq:efficiency}
  E(N) = \frac{S(N)}{N}.
\end{equation}
Averages over ranks were calculated as
\begin{equation}
  \label{eq:avg-tcomp}
  \overline{\tcomp} = \frac{1}{N}
  \sum_{\text{rank}=1}^{N}\tcomp = \frac{1}{N}\sum_{\text{rank}=1}^{N}\sum_{\text{frame}=1}^{N_\text{b}}\tcomp^{\text{frame}},
\end{equation}
\begin{equation}
  \label{eq:avg-tIO}
  \overline{\tIO} = \frac{1}{N}\sum_{\text{rank}=1}^{N}\tIO = \frac{1}{N}\sum_{\text{rank}=1}^{N}\sum_{\text{frame}=1}^{N_{\text{b}}}\tIO^{\text{frame}},
\end{equation}
and
\begin{equation}
  \label{eq:avg-tcomm}
  \overline{\tcomm} = \frac{1}{N}\sum_{\text{rank}=1}^{N}\tcomm.
\end{equation}

Additionally, we introduced two performance parameters that we found to be indicative of the occurrence of stragglers.
We defined the ratio of compute time to read I/O time for the serial code as
\begin{equation}
  \label{eq:Compute-IO}  
  \RcompIO = \frac{\tcomp}{\tIO} = %
  \frac{\tcomp/N_{\text{frames}}^{\text{total}}}{\tIO/N_{\text{frames}}^{\text{total}}}  = %
  \frac{\overline{\tcomp^{\text{frame}}}}{\overline{\tIO^{\text{frame}}}}  
\end{equation}
where the last equality shows that the ratio can also be computed from the average times per frame, $\overline{\tcomp^{\text{frame}}}$ and $\overline{\tIO^{\text{frame}}}$.
\RcompIO was calculated with the serial versions of our algorithms (on a single CPU core) in order to characterize the computational problem in the absence of parallelization.
The ratio of compute to communication time was defined by the ratio of average total compute time to the average total communication time   
\begin{equation}
  \label{eq:Compute-comm}
  \Rcompcomm = \frac{\overline{\tcomp}}{\overline{\tcomm}}.
\end{equation}
Because \tcomm cannot be measured for a serial code, we estimated \Rcompcomm from the rank-averages (Eqs.~\ref{eq:avg-tcomp} and \ref{eq:avg-tcomm}) for a given number of MPI ranks.


\subsection{Data sharing}
\label{sec:sharing}

Documentation and benchmark/trajectory conversion scripts are made available in the repository \url{https://github.com/hpcanalytics/supplement-hpc-py-parallel-mdanalysis} under the GNU General Public License v3.0 (code) and the Creative Commons Attribution-ShareAlike (documentation).
All materials are archived under DOI \href{https://doi.org/10.5281/zenodo.3351616}{10.5281/zenodo.3351616}.
These materials should enable users to recreate the computational environment on the tested XSEDE HPC resources (\emph{SDSC Comet}, \emph{PSC Bridges}, \emph{LSU SuperMIC}), prepare data files, and run the computational experiments.