Skip to content
This repository has been archived by the owner on Jan 27, 2023. It is now read-only.

Commit

Permalink
Merge pull request #5 from scikit-hep/roothist-to-pandas
Browse files Browse the repository at this point in the history
Roothist to pandas
  • Loading branch information
jpivarski authored Sep 26, 2018
2 parents 4bba49c + da109f7 commit 7258071
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 17 deletions.
132 changes: 115 additions & 17 deletions uproot_methods/classes/TH1.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,10 +195,9 @@ def numpy(self):
edges = numpy.linspace(self._fXaxis._fXmin, self._fXaxis._fXmax, self._fXaxis._fNbins + 1)
return freq, edges

def pandas(self, underflow=True, overflow=True):
def pandas(self, underflow=True, overflow=True, variance=True):
import pandas
freq = numpy.array(self.allvalues, dtype=self._dtype.newbyteorder("="))
print("freq", len(freq))

if not underflow and not overflow:
freq = freq[1:-1]
Expand All @@ -223,25 +222,34 @@ def pandas(self, underflow=True, overflow=True):
elif not overflow:
edges = edges[:-1]

if getattr(self, "_fTitle", b"") == b"":
name = None
else:
name = self._fTitle.decode("utf-8", "ignore")

lefts, rights = edges[:-1], edges[1:]

nonzero = (freq != 0.0)
index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left")
index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left", name=name)

data = {"count": freq[nonzero]}
if getattr(self, "_fSumw2", None):
sumw2 = self._fSumw2
if not underflow and not overflow:
sumw2 = sumw2[1:-1]
elif not underflow:
sumw2 = sumw2[1:]
elif not overflow:
sumw2 = sumw2[:-1]
data["variance"] = numpy.array(sumw2)[nonzero]
else:
data["variance"] = data["count"]
columns = ["count"]

if variance:
if getattr(self, "_fSumw2", None):
sumw2 = self._fSumw2
if not underflow and not overflow:
sumw2 = sumw2[1:-1]
elif not underflow:
sumw2 = sumw2[1:]
elif not overflow:
sumw2 = sumw2[:-1]
data["variance"] = numpy.array(sumw2)[nonzero]
else:
data["variance"] = data["count"]
columns.append("variance")

return pandas.DataFrame(index=index, data=data, columns=["count", "variance"])
return pandas.DataFrame(index=index, data=data, columns=columns)

def physt(self):
import physt.binnings
Expand Down Expand Up @@ -323,13 +331,13 @@ class TH1(Methods, list):
pass

class TAxis(object):
def __init__(self, fNbins, fXmin, fXmax, fXbins):
def __init__(self, fNbins, fXmin, fXmax):
self._fNbins = fNbins
self._fXmin = fXmin
self._fXmax = fXmax

out = TH1.__new__(TH1)
out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1], None)
out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1])
if not numpy.array_equal(edges, numpy.linspace(edges[0], edges[-1], len(edges), dtype=edges.dtype)):
out._fXaxis._fXbins = edges.astype(">f8")

Expand All @@ -354,6 +362,96 @@ def __init__(self, fNbins, fXmin, fXmax, fXbins):

return out

def from_pandas(histogram):
import pandas

histogram = histogram.sort_index(ascending=True, inplace=False)
if not histogram.index.is_non_overlapping_monotonic:
raise ValueError("intervals overlap; cannot form a histogram")

sparse = histogram.index[numpy.isfinite(histogram.index.left) & numpy.isfinite(histogram.index.right)]
if (sparse.right[:-1] == sparse.left[1:]).all():
dense = sparse
else:
pairs = numpy.empty(len(sparse) * 2, dtype=numpy.float64)
pairs[::2] = sparse.left
pairs[1::2] = sparse.right
nonempty = numpy.empty(len(pairs), dtype=numpy.bool_)
nonempty[:-1] = (pairs[1:] != pairs[:-1])
nonempty[-1] = True
dense = pandas.IntervalIndex.from_breaks(pairs[nonempty], closed="left")

densehist = pandas.DataFrame(index=dense.left).join(histogram.reindex(histogram.index.left))
densehist.fillna(0, inplace=True)

underflowhist = histogram[numpy.isinf(histogram.index.left)]
overflowhist = histogram[numpy.isinf(histogram.index.right)]

content = numpy.array(densehist["count"])

sumw2 = numpy.empty(len(content) + 2, dtype=numpy.float64)
if "variance" in densehist.columns:
sumw2source = "variance"
else:
sumw2source = "count"
sumw2[1:-1] = densehist[sumw2source]
if len(underflowhist) == 0:
sumw2[0] = 0
else:
sumw2[0] = underflowhist[sumw2source]
if len(overflowhist) == 0:
sumw2[-1] = 0
else:
sumw2[-1] = overflowhist[sumw2source]

edges = numpy.empty(len(densehist) + 1, dtype=numpy.float64)
edges[:-1] = dense.left
edges[-1] = dense.right[-1]

class TH1(Methods, list):
pass

class TAxis(object):
def __init__(self, fNbins, fXmin, fXmax):
self._fNbins = fNbins
self._fXmin = fXmin
self._fXmax = fXmax

out = TH1.__new__(TH1)
out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1])
out._fXaxis._fXbins = edges

centers = (edges[:-1] + edges[1:]) / 2.0
out._fEntries = content.sum()
out._fTsumw = content.sum()
out._fTsumw2 = sumw2.sum()
out._fTsumwx = (content * centers).sum()
out._fTsumwx2 = (content * centers**2).sum()

if histogram.index.name is None:
out._fTitle = b""
elif isinstance(histogram.index.name, bytes):
out._fTitle = histogram.index.name
else:
out._fTitle = histogram.index.name.encode("utf-8", "ignore")

out._classname, content = _histtype(content)

valuesarray = numpy.empty(len(content) + 2, dtype=content.dtype)
valuesarray[1:-1] = content
if len(underflowhist) == 0:
valuesarray[0] = 0
else:
valuesarray[0] = underflowhist["count"]
if len(overflowhist) == 0:
valuesarray[-1] = 0
else:
valuesarray[-1] = overflowhist["count"]

out.extend(valuesarray)

return out

def from_physt(histogram):
import physt.binnings
import physt.histogram1d
Expand Down
5 changes: 5 additions & 0 deletions uproot_methods/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def resolve(obj):
def types(cls, obj):
if cls is numpy.ndarray:
yield ("numpy", "ndarray", len(obj.shape), str(obj.dtype))
elif cls.__module__ == "pandas.core.frame" and cls.__name__ == "DataFrame":
yield ("pandas.core.frame", "DataFrame", obj.index.__class__.__name__, set(obj.columns))
else:
yield (cls.__module__, cls.__name__)
for x in cls.__bases__:
Expand All @@ -52,6 +54,9 @@ def types(cls, obj):
elif isinstance(obj, tuple) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[0].__class__, obj[0])) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[1].__class__, obj[1])) and len(obj[0]) + 1 == len(obj[1]):
return ("uproot_methods.classes.TH1", "from_numpy", "uproot.write.objects.TH1", "TH1")

elif any(x[:3] == ("pandas.core.frame", "DataFrame", "IntervalIndex") and "count" in x[3] for x in types(obj.__class__, obj)):
return ("uproot_methods.classes.TH1", "from_pandas", "uproot.write.objects.TH1", "TH1")

elif any(x == ("physt.histogram1d", "Histogram1D") for x in types(obj.__class__, obj)):
return ("uproot_methods.classes.TH1", "from_physt", "uproot.write.objects.TH1", "TH1")

Expand Down

0 comments on commit 7258071

Please sign in to comment.