Commit d77f77e
Merge pull request #78 from CoffeaTeam/topic_parsl
Parsl executor and corresponding tests with simple config
lgray authored May 17, 2019
2 parents e3a402c + cc1da9f commit d77f77e
Showing 18 changed files with 292 additions and 188 deletions.
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
include README.rst
include LICENSE
include fnal_column_analysis_tools/processor/templates/*.tmpl
1 change: 0 additions & 1 deletion fnal_column_analysis_tools/processor/__init__.py
@@ -7,7 +7,6 @@
from .executor import (
    iterative_executor,
    futures_executor,
    condor_executor,
    run_uproot_job,
    run_parsl_job,
    run_spark_job
4 changes: 2 additions & 2 deletions fnal_column_analysis_tools/processor/dataframe.py
@@ -11,9 +11,9 @@ class LazyDataFrame(MutableMapping):
    Simple delayed uproot reader (a la lazyarrays)
    Keeps track of values accessed, for later parsing.
    """
    def __init__(self, tree, stride=None, index=None, preload_items=None):
    def __init__(self, tree, stride=None, index=None, preload_items=None, flatten=False):
        self._tree = tree
        self._branchargs = {'awkwardlib': awkward}
        self._branchargs = {'awkwardlib': awkward, 'flatten': flatten}
        self._stride = None
        if (stride is not None) and (index is not None):
            self._stride = stride
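For context, a minimal sketch (not part of this commit) of what the new flatten option changes once it is forwarded to uproot's branch reads: jagged branches come back flattened into plain arrays. The file path and branch name are hypothetical.

import awkward
import uproot

tree = uproot.open('root://host//store/nano.root')['Events']   # hypothetical input

jagged = tree.array('Jet_pt', awkwardlib=awkward)                # JaggedArray, one sublist per event
flat = tree.array('Jet_pt', awkwardlib=awkward, flatten=True)    # flattened into a 1D array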
41 changes: 24 additions & 17 deletions fnal_column_analysis_tools/processor/executor.py
@@ -6,10 +6,10 @@
from .accumulator import accumulator

try:
    from collections.abc import Mapping
    from collections.abc import Mapping, Sequence
    from functools import lru_cache
except ImportError:
    from collections import Mapping
    from collections import Mapping, Sequence

    def lru_cache(maxsize):
        def null_wrapper(f):
@@ -60,14 +60,6 @@ def futures_executor(items, function, accumulator, workers=2, status=True, unit=
    return accumulator


def condor_executor(items, function, accumulator, workers, status=True, unit='items', desc='Processing'):
    raise NotImplementedError


def spark_executor(items, function, accumulator, config, status=True, unit='datasets', desc='Processing'):
    raise NotImplementedError


def _work_function(item):
    dataset, fn, treename, chunksize, index, processor_instance = item
    file = uproot.open(fn)
@@ -122,7 +114,7 @@ def run_uproot_job(fileset, treename, processor_instance, executor, executor_arg
    return output


def run_parsl_job(fileset, treename, processor_instance, executor, executor_args={'config': None}, chunksize=500000):
def run_parsl_job(fileset, treename, processor_instance, executor, data_flow=None, executor_args={'config': None}, chunksize=500000):
    '''
    A convenience wrapper to submit jobs for a file, which is a
    dictionary of dataset: [file list] entries. In this case using parsl.
@@ -149,8 +141,8 @@ def run_parsl_job(fileset, treename, processor_instance, executor, executor_args

    print('parsl version:', parsl.__version__)

    from .parsl.detail import _parsl_work_function, _parsl_get_chunking
    from .parsl.parsl_base_executor import ParslBaseExecutor
    from .parsl.parsl_executor import ParslExecutor
    from .parsl.detail import _parsl_initialize, _parsl_stop, _parsl_get_chunking

    if executor_args['config'] is None:
        executor_args.pop('config')
@@ -159,17 +151,32 @@ def run_parsl_job(fileset, treename, processor_instance, executor, executor_args
raise ValueError("Expected fileset to be a mapping dataset: list(files)")
if not isinstance(processor_instance, ProcessorABC):
raise ValueError("Expected processor_instance to derive from ProcessorABC")
if not isinstance(executor, ParslBaseExecutor):
if not isinstance(executor, ParslExecutor):
raise ValueError("Expected executor to derive from ParslBaseExecutor")

# initialize spark if we need to
# if we initialize, then we deconstruct
# when we're done
killParsl = False
if data_flow is None:
data_flow = _parsl_initialize(**executor_args)
killParsl = True
else:
if not isinstance(data_flow, parsl.dataflow.dflow.DataFlowKernel):
raise ValueError("Expected 'data_flow' to be a parsl.dataflow.dflow.DataFlowKernel")

items = []
for dataset, filelist in tqdm(fileset.items(), desc='Preprocessing'):
for chunk in _parsl_get_chunking(tuple(filelist), treename, chunksize):
items.append((dataset, chunk[0], treename, chunk[1], chunk[2], processor_instance))
items.append((dataset, chunk[0], treename, chunk[1], chunk[2]))

output = processor_instance.accumulator.identity()
executor(items, _parsl_work_function, output, **executor_args)
executor(data_flow, items, processor_instance, output, **executor_args)
processor_instance.postprocess(output)

if killParsl:
_parsl_stop(data_flow)

return output


@@ -230,7 +237,7 @@ def run_spark_job(fileset, processor_instance, executor, executor_args={'config'
    if not isinstance(spark, pyspark.sql.session.SparkSession):
        raise ValueError("Expected 'spark' to be a pyspark.sql.session.SparkSession")

    dfslist = _spark_make_dfs(spark, fileset, partitionsize, thread_workers)
    dfslist = _spark_make_dfs(spark, fileset, partitionsize, processor_instance.columns, thread_workers)

    output = processor_instance.accumulator.identity()
    executor(spark, dfslist, processor_instance, output, thread_workers)
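A minimal driver sketch (not part of this diff) showing how the reworked run_parsl_job is meant to be called with the new parsl executor. The fileset paths and the MyProcessor class are hypothetical stand-ins for any ProcessorABC implementation.

from fnal_column_analysis_tools import processor
from fnal_column_analysis_tools.processor.parsl.parsl_executor import parsl_executor

fileset = {
    'ZJets': ['root://host//store/zjets_1.root'],   # hypothetical files
    'Data': ['root://host//store/data_1.root'],
}

output = processor.run_parsl_job(fileset,
                                 treename='Events',
                                 processor_instance=MyProcessor(),   # hypothetical ProcessorABC subclass
                                 executor=parsl_executor,
                                 executor_args={'config': None},     # None falls back to the default local config
                                 chunksize=500000)

Passing an already-loaded DataFlowKernel via data_flow= instead leaves parsl running afterwards, since _parsl_stop is only called when run_parsl_job did the initialization itself.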
23 changes: 0 additions & 23 deletions fnal_column_analysis_tools/processor/parsl/condor_executor.py

This file was deleted.

25 changes: 12 additions & 13 deletions fnal_column_analysis_tools/processor/parsl/detail.py
@@ -1,12 +1,14 @@
from concurrent.futures import as_completed
import multiprocessing

import parsl

from parsl.app.app import python_app

from parsl.providers import LocalProvider
from parsl.channels import LocalChannel
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_hostname

try:
    from functools import lru_cache
@@ -17,33 +19,30 @@ def null_wrapper(f):

        return null_wrapper


default_cfg = Config(
_default_cfg = Config(
    executors=[
        HighThroughputExecutor(
            label="coffea_parsl_default",
            address=address_by_hostname(),
            prefetch_capacity=0,
            worker_debug=True,
            cores_per_worker=1,
            max_workers=1,
            # max_blocks=200,
            # workers_per_node=1,
            worker_logdir_root='./',
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
                nodes_per_block=1
            ),
        )
    ],
    strategy=None,
)


def _parsl_work_function():
    raise NotImplementedError
def _parsl_initialize(config=_default_cfg):
    dfk = parsl.load(config)
    return dfk


def _parsl_stop(dfk):
    dfk.cleanup()
    parsl.clear()


@python_app
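If the default single-worker local setup is not enough, a user-supplied parsl Config can be passed through executor_args instead of _default_cfg. A hedged sketch that reuses only the constructor arguments seen above; the label and max_workers value are illustrative choices, not part of this commit.

from parsl.providers import LocalProvider
from parsl.channels import LocalChannel
from parsl.config import Config
from parsl.executors import HighThroughputExecutor
from parsl.addresses import address_by_hostname

my_cfg = Config(
    executors=[
        HighThroughputExecutor(
            label='coffea_parsl_local',
            address=address_by_hostname(),
            prefetch_capacity=0,
            cores_per_worker=1,
            max_workers=4,                    # more workers per node than the default of 1
            worker_logdir_root='./',
            provider=LocalProvider(
                channel=LocalChannel(),
                init_blocks=1,
                max_blocks=1,
                nodes_per_block=1
            ),
        )
    ],
    strategy=None,
)

# handed to run_parsl_job as executor_args={'config': my_cfg};
# _parsl_initialize then passes it straight to parsl.load()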
16 changes: 0 additions & 16 deletions fnal_column_analysis_tools/processor/parsl/parsl_base_executor.py

This file was deleted.

70 changes: 70 additions & 0 deletions fnal_column_analysis_tools/processor/parsl/parsl_executor.py
@@ -0,0 +1,70 @@
from fnal_column_analysis_tools import hist, processor
from copy import deepcopy
from concurrent.futures import as_completed

from tqdm import tqdm
import cloudpickle as cpkl
import lz4.frame as lz4f
import numpy as np
import pandas as pd

from parsl.app.app import python_app

lz4_clevel = 1


@python_app
def coffea_pyapp(dataset, fn, treename, chunksize, index, procstr):
    import uproot
    import cloudpickle as cpkl
    import lz4.frame as lz4f
    from fnal_column_analysis_tools import hist, processor
    from fnal_column_analysis_tools.processor.accumulator import accumulator

    lz4_clevel = 1

    # instrument xrootd source
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):

        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    file = uproot.open(fn)
    tree = file[treename]

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=True)
    df['dataset'] = dataset

    vals = processor_instance.process(df)
    vals['_bytesread'] = accumulator(file.source.bytesread if isinstance(file.source, uproot.source.xrootd.XRootDSource) else 0)
    valsblob = lz4f.compress(cpkl.dumps(vals), compression_level=lz4_clevel)

    return valsblob


class ParslExecutor(object):

    def __init__(self):
        pass

    def __call__(self, dfk, items, processor_instance, output, unit='items', desc='Processing'):
        procstr = lz4f.compress(cpkl.dumps(processor_instance))

        nitems = len(items)
        ftr_to_item = set()
        for dataset, fn, treename, chunksize, index in items:
            ftr_to_item.add(coffea_pyapp(dataset, fn, treename, chunksize, index, procstr))

        for ftr in tqdm(as_completed(ftr_to_item), total=nitems, unit=unit, desc=desc):
            blob = ftr.result()
            ftrhist = cpkl.loads(lz4f.decompress(blob))
            output.add(ftrhist)


parsl_executor = ParslExecutor()
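The wire format between driver and parsl workers above boils down to lz4-framed cloudpickle blobs. A standalone round-trip sketch with a stand-in payload (the real code ships a processor instance one way and its accumulator back the other):

import cloudpickle as cpkl
import lz4.frame as lz4f

lz4_clevel = 1

payload = {'cutflow': {'all events': 42}}   # stand-in for a processor or accumulator
blob = lz4f.compress(cpkl.dumps(payload), compression_level=lz4_clevel)
assert cpkl.loads(lz4f.decompress(blob)) == payload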
23 changes: 0 additions & 23 deletions fnal_column_analysis_tools/processor/parsl/slurm_executor.py

This file was deleted.

22 changes: 16 additions & 6 deletions fnal_column_analysis_tools/processor/spark/detail.py
@@ -19,6 +19,8 @@ def _spark_initialize(config=_default_config, **kwargs):
        spark_progress = kwargs['spark_progress']

    cfg_actual = config
    # get spark to not complain about missing log configs
    cfg_actual = cfg_actual.config('spark.driver.extraJavaOptions', '-Dlog4jspark.root.logger=ERROR,console')
    if not spark_progress:
        cfg_actual = cfg_actual.config('spark.ui.showConsoleProgress', 'false')

@@ -36,22 +38,30 @@ def _spark_initialize(config=_default_config, **kwargs):
def _read_df(spark, files_or_dirs):
    if not isinstance(files_or_dirs, Sequence):
        raise ValueError("spark dataset file list must be a Sequence (like list())")
    return spark.read.parquet(*files_or_dirs)
    df = spark.read.parquet(*files_or_dirs)
    count = df.count()
    return df, count


def _spark_make_dfs(spark, fileset, partitionsize, thread_workers):
def _spark_make_dfs(spark, fileset, partitionsize, columns, thread_workers):
    dfs = {}
    ana_cols = set(columns)
    with ThreadPoolExecutor(max_workers=thread_workers) as executor:
        future_to_ds = {executor.submit(_read_df, spark, fileset[dataset]): dataset for dataset in fileset.keys()}
        for ftr in tqdm(as_completed(future_to_ds), total=len(fileset), desc='loading', unit='datasets'):
            dataset = future_to_ds[ftr]
            df = ftr.result()
            df, count = ftr.result()
            df_cols = set(df.columns)
            cols_in_df = ana_cols.intersection(df_cols)
            df = df.select(*cols_in_df)
            missing_cols = ana_cols - cols_in_df
            for missing in missing_cols:
                df = df.withColumn(missing, fn.lit(0.0))
            df = df.withColumn('dataset', fn.lit(dataset))
            count = df.count()
            npartitions = max(count // partitionsize, 1)
            if df.rdd.getNumPartitions() > npartitions:
                df = df.coalesce(npartitions)
            dfs[dataset] = df
            df = df.repartition(npartitions)
            dfs[dataset] = (df, count)
    return dfs
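The column handling added to _spark_make_dfs (select only the analysis columns the file actually provides, zero-fill the rest, tag the dataset) can be read in isolation as the following hedged sketch; the helper name is illustrative and not part of this commit.

import pyspark.sql.functions as fn

def _prune_and_tag(df, dataset, columns):
    # keep only the analysis columns that the file actually provides
    ana_cols = set(columns)
    cols_in_df = ana_cols.intersection(set(df.columns))
    df = df.select(*cols_in_df)
    # fill any requested-but-missing columns with a harmless default
    for missing in ana_cols - cols_in_df:
        df = df.withColumn(missing, fn.lit(0.0))
    # record which dataset each row came from
    return df.withColumn('dataset', fn.lit(dataset))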