diff --git a/uproot_methods/classes/TH1.py b/uproot_methods/classes/TH1.py index 3f6ba1b..7e9b36f 100644 --- a/uproot_methods/classes/TH1.py +++ b/uproot_methods/classes/TH1.py @@ -195,10 +195,9 @@ def numpy(self): edges = numpy.linspace(self._fXaxis._fXmin, self._fXaxis._fXmax, self._fXaxis._fNbins + 1) return freq, edges - def pandas(self, underflow=True, overflow=True): + def pandas(self, underflow=True, overflow=True, variance=True): import pandas freq = numpy.array(self.allvalues, dtype=self._dtype.newbyteorder("=")) - print("freq", len(freq)) if not underflow and not overflow: freq = freq[1:-1] @@ -223,25 +222,34 @@ def pandas(self, underflow=True, overflow=True): elif not overflow: edges = edges[:-1] + if getattr(self, "_fTitle", b"") == b"": + name = None + else: + name = self._fTitle.decode("utf-8", "ignore") + lefts, rights = edges[:-1], edges[1:] nonzero = (freq != 0.0) - index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left") + index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left", name=name) data = {"count": freq[nonzero]} - if getattr(self, "_fSumw2", None): - sumw2 = self._fSumw2 - if not underflow and not overflow: - sumw2 = sumw2[1:-1] - elif not underflow: - sumw2 = sumw2[1:] - elif not overflow: - sumw2 = sumw2[:-1] - data["variance"] = numpy.array(sumw2)[nonzero] - else: - data["variance"] = data["count"] + columns = ["count"] + + if variance: + if getattr(self, "_fSumw2", None): + sumw2 = self._fSumw2 + if not underflow and not overflow: + sumw2 = sumw2[1:-1] + elif not underflow: + sumw2 = sumw2[1:] + elif not overflow: + sumw2 = sumw2[:-1] + data["variance"] = numpy.array(sumw2)[nonzero] + else: + data["variance"] = data["count"] + columns.append("variance") - return pandas.DataFrame(index=index, data=data, columns=["count", "variance"]) + return pandas.DataFrame(index=index, data=data, columns=columns) def physt(self): import physt.binnings @@ -323,13 +331,13 @@ class TH1(Methods, list): pass class TAxis(object): - def __init__(self, fNbins, fXmin, fXmax, fXbins): + def __init__(self, fNbins, fXmin, fXmax): self._fNbins = fNbins self._fXmin = fXmin self._fXmax = fXmax out = TH1.__new__(TH1) - out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1], None) + out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1]) if not numpy.array_equal(edges, numpy.linspace(edges[0], edges[-1], len(edges), dtype=edges.dtype)): out._fXaxis._fXbins = edges.astype(">f8") @@ -354,6 +362,96 @@ def __init__(self, fNbins, fXmin, fXmax, fXbins): return out +def from_pandas(histogram): + import pandas + + histogram = histogram.sort_index(ascending=True, inplace=False) + if not histogram.index.is_non_overlapping_monotonic: + raise ValueError("intervals overlap; cannot form a histogram") + + sparse = histogram.index[numpy.isfinite(histogram.index.left) & numpy.isfinite(histogram.index.right)] + if (sparse.right[:-1] == sparse.left[1:]).all(): + dense = sparse + else: + pairs = numpy.empty(len(sparse) * 2, dtype=numpy.float64) + pairs[::2] = sparse.left + pairs[1::2] = sparse.right + nonempty = numpy.empty(len(pairs), dtype=numpy.bool_) + nonempty[:-1] = (pairs[1:] != pairs[:-1]) + nonempty[-1] = True + dense = pandas.IntervalIndex.from_breaks(pairs[nonempty], closed="left") + + densehist = pandas.DataFrame(index=dense.left).join(histogram.reindex(histogram.index.left)) + densehist.fillna(0, inplace=True) + + underflowhist = histogram[numpy.isinf(histogram.index.left)] + overflowhist = histogram[numpy.isinf(histogram.index.right)] + + content = numpy.array(densehist["count"]) + + sumw2 = numpy.empty(len(content) + 2, dtype=numpy.float64) + if "variance" in densehist.columns: + sumw2source = "variance" + else: + sumw2source = "count" + sumw2[1:-1] = densehist[sumw2source] + if len(underflowhist) == 0: + sumw2[0] = 0 + else: + sumw2[0] = underflowhist[sumw2source] + if len(overflowhist) == 0: + sumw2[-1] = 0 + else: + sumw2[-1] = overflowhist[sumw2source] + + edges = numpy.empty(len(densehist) + 1, dtype=numpy.float64) + edges[:-1] = dense.left + edges[-1] = dense.right[-1] + + class TH1(Methods, list): + pass + + class TAxis(object): + def __init__(self, fNbins, fXmin, fXmax): + self._fNbins = fNbins + self._fXmin = fXmin + self._fXmax = fXmax + + out = TH1.__new__(TH1) + out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1]) + out._fXaxis._fXbins = edges + + centers = (edges[:-1] + edges[1:]) / 2.0 + out._fEntries = content.sum() + out._fTsumw = content.sum() + out._fTsumw2 = sumw2.sum() + out._fTsumwx = (content * centers).sum() + out._fTsumwx2 = (content * centers**2).sum() + + if histogram.index.name is None: + out._fTitle = b"" + elif isinstance(histogram.index.name, bytes): + out._fTitle = histogram.index.name + else: + out._fTitle = histogram.index.name.encode("utf-8", "ignore") + + out._classname, content = _histtype(content) + + valuesarray = numpy.empty(len(content) + 2, dtype=content.dtype) + valuesarray[1:-1] = content + if len(underflowhist) == 0: + valuesarray[0] = 0 + else: + valuesarray[0] = underflowhist["count"] + if len(overflowhist) == 0: + valuesarray[-1] = 0 + else: + valuesarray[-1] = overflowhist["count"] + + out.extend(valuesarray) + + return out + def from_physt(histogram): import physt.binnings import physt.histogram1d diff --git a/uproot_methods/convert.py b/uproot_methods/convert.py index 55d0cd2..1e687fd 100644 --- a/uproot_methods/convert.py +++ b/uproot_methods/convert.py @@ -40,6 +40,8 @@ def resolve(obj): def types(cls, obj): if cls is numpy.ndarray: yield ("numpy", "ndarray", len(obj.shape), str(obj.dtype)) + elif cls.__module__ == "pandas.core.frame" and cls.__name__ == "DataFrame": + yield ("pandas.core.frame", "DataFrame", obj.index.__class__.__name__, set(obj.columns)) else: yield (cls.__module__, cls.__name__) for x in cls.__bases__: @@ -52,6 +54,9 @@ def types(cls, obj): elif isinstance(obj, tuple) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[0].__class__, obj[0])) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[1].__class__, obj[1])) and len(obj[0]) + 1 == len(obj[1]): return ("uproot_methods.classes.TH1", "from_numpy", "uproot.write.objects.TH1", "TH1") + elif any(x[:3] == ("pandas.core.frame", "DataFrame", "IntervalIndex") and "count" in x[3] for x in types(obj.__class__, obj)): + return ("uproot_methods.classes.TH1", "from_pandas", "uproot.write.objects.TH1", "TH1") + elif any(x == ("physt.histogram1d", "Histogram1D") for x in types(obj.__class__, obj)): return ("uproot_methods.classes.TH1", "from_physt", "uproot.write.objects.TH1", "TH1")