Merge pull request #5 from scikit-hep/roothist-to-pandas

Roothist to pandas
scikit-hep · Sep 26, 2018 · 7258071
2 parents 4bba49c + da109f7
commit 7258071
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 17 deletions.
diff --git a/uproot_methods/classes/TH1.py b/uproot_methods/classes/TH1.py
@@ -195,10 +195,9 @@ def numpy(self):
             edges = numpy.linspace(self._fXaxis._fXmin, self._fXaxis._fXmax, self._fXaxis._fNbins + 1)
         return freq, edges
 
-    def pandas(self, underflow=True, overflow=True):
+    def pandas(self, underflow=True, overflow=True, variance=True):
         import pandas
         freq = numpy.array(self.allvalues, dtype=self._dtype.newbyteorder("="))
-        print("freq", len(freq))
 
         if not underflow and not overflow:
             freq = freq[1:-1]
@@ -223,25 +222,34 @@ def pandas(self, underflow=True, overflow=True):
         elif not overflow:
             edges = edges[:-1]
 
+        if getattr(self, "_fTitle", b"") == b"":
+            name = None
+        else:
+            name = self._fTitle.decode("utf-8", "ignore")
+
         lefts, rights = edges[:-1], edges[1:]
 
         nonzero = (freq != 0.0)
-        index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left")
+        index = pandas.IntervalIndex.from_arrays(lefts[nonzero], rights[nonzero], closed="left", name=name)
 
         data = {"count": freq[nonzero]}
-        if getattr(self, "_fSumw2", None):
-            sumw2 = self._fSumw2
-            if not underflow and not overflow:
-                sumw2 = sumw2[1:-1]
-            elif not underflow:
-                sumw2 = sumw2[1:]
-            elif not overflow:
-                sumw2 = sumw2[:-1]
-            data["variance"] = numpy.array(sumw2)[nonzero]
-        else:
-            data["variance"] = data["count"]
+        columns = ["count"]
+
+        if variance:
+            if getattr(self, "_fSumw2", None):
+                sumw2 = self._fSumw2
+                if not underflow and not overflow:
+                    sumw2 = sumw2[1:-1]
+                elif not underflow:
+                    sumw2 = sumw2[1:]
+                elif not overflow:
+                    sumw2 = sumw2[:-1]
+                data["variance"] = numpy.array(sumw2)[nonzero]
+            else:
+                data["variance"] = data["count"]
+            columns.append("variance")
 
-        return pandas.DataFrame(index=index, data=data, columns=["count", "variance"])
+        return pandas.DataFrame(index=index, data=data, columns=columns)
 
     def physt(self):
         import physt.binnings
@@ -323,13 +331,13 @@ class TH1(Methods, list):
         pass
 
     class TAxis(object):
-        def __init__(self, fNbins, fXmin, fXmax, fXbins):
+        def __init__(self, fNbins, fXmin, fXmax):
             self._fNbins = fNbins
             self._fXmin = fXmin
             self._fXmax = fXmax
 
     out = TH1.__new__(TH1)
-    out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1], None)
+    out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1])
     if not numpy.array_equal(edges, numpy.linspace(edges[0], edges[-1], len(edges), dtype=edges.dtype)):
         out._fXaxis._fXbins = edges.astype(">f8")
 
@@ -354,6 +362,96 @@ def __init__(self, fNbins, fXmin, fXmax, fXbins):
 
     return out
 
+def from_pandas(histogram):
+    import pandas
+    # Convert a DataFrame with an interval index and a "count" column (optionally "variance") into a TH1-like object.
+    histogram = histogram.sort_index(ascending=True, inplace=False)
+    if not histogram.index.is_non_overlapping_monotonic:
+        raise ValueError("intervals overlap; cannot form a histogram")
+    # Keep only the bins whose edges are both finite; infinite-edged rows are picked up as under/overflow below.
+    sparse = histogram.index[numpy.isfinite(histogram.index.left) & numpy.isfinite(histogram.index.right)]
+    if (sparse.right[:-1] == sparse.left[1:]).all():
+        dense = sparse  # every bin already starts where the previous one ends
+    else:  # some neighbours do not share an edge: rebuild contiguous bins from the de-duplicated edges
+        pairs = numpy.empty(len(sparse) * 2, dtype=numpy.float64)
+        pairs[::2] = sparse.left  # interleave as left0, right0, left1, right1, ...
+        pairs[1::2] = sparse.right
+        nonempty = numpy.empty(len(pairs), dtype=numpy.bool_)
+        nonempty[:-1] = (pairs[1:] != pairs[:-1])  # drop an edge when it equals the next one (shared edge)
+        nonempty[-1] = True  # the final right edge is always kept
+        dense = pandas.IntervalIndex.from_breaks(pairs[nonempty], closed="left")
+    # Align the input rows to the dense bins by left edge; dense bins with no input row are filled with 0.
+    densehist = pandas.DataFrame(index=dense.left).join(histogram.reindex(histogram.index.left))
+    densehist.fillna(0, inplace=True)
+    # Rows with an infinite left edge become underflowhist; rows with an infinite right edge become overflowhist.
+    underflowhist = histogram[numpy.isinf(histogram.index.left)]
+    overflowhist = histogram[numpy.isinf(histogram.index.right)]
+    # Contents of the finite bins only (no under/overflow slots yet).
+    content = numpy.array(densehist["count"])
+    # sumw2 has one extra slot at each end for under/overflow; "count" is the source when there is no "variance" column.
+    sumw2 = numpy.empty(len(content) + 2, dtype=numpy.float64)
+    if "variance" in densehist.columns:
+        sumw2source = "variance"
+    else:
+        sumw2source = "count"
+    sumw2[1:-1] = densehist[sumw2source]
+    if len(underflowhist) == 0:
+        sumw2[0] = 0
+    else:
+        sumw2[0] = underflowhist[sumw2source]  # NOTE(review): stores a pandas column in one slot; assumes a single underflow row, confirm
+    if len(overflowhist) == 0:
+        sumw2[-1] = 0
+    else:
+        sumw2[-1] = overflowhist[sumw2source]  # NOTE(review): same single-row assumption for overflow, confirm
+    # Bin edges: every left edge of the dense bins followed by the last right edge.
+    edges = numpy.empty(len(densehist) + 1, dtype=numpy.float64)
+    edges[:-1] = dense.left
+    edges[-1] = dense.right[-1]  # NOTE(review): presumably requires at least one finite bin, confirm
+    # Local stand-in classes; Methods is not defined in this function and comes from the enclosing module.
+    class TH1(Methods, list):
+        pass
+    # Holder for the axis attributes (_fNbins, _fXmin, _fXmax).
+    class TAxis(object):
+        def __init__(self, fNbins, fXmin, fXmax):
+            self._fNbins = fNbins
+            self._fXmin = fXmin
+            self._fXmax = fXmax
+    # __new__ creates the object without running __init__; its attributes are assigned one by one below.
+    out = TH1.__new__(TH1)
+    out._fXaxis = TAxis(len(edges) - 1, edges[0], edges[-1])
+    out._fXaxis._fXbins = edges  # explicit edges are always stored here
+    # Summary sums use bin centers and the finite-bin contents; only _fTsumw2 also includes the under/overflow slots.
+    centers = (edges[:-1] + edges[1:]) / 2.0
+    out._fEntries = content.sum()
+    out._fTsumw = content.sum()
+    out._fTsumw2 = sumw2.sum()
+    out._fTsumwx = (content * centers).sum()
+    out._fTsumwx2 = (content * centers**2).sum()
+    # The index name supplies the title, stored as bytes (text names are UTF-8 encoded, no name gives b"").
+    if histogram.index.name is None:
+        out._fTitle = b""
+    elif isinstance(histogram.index.name, bytes):
+        out._fTitle = histogram.index.name
+    else:
+        out._fTitle = histogram.index.name.encode("utf-8", "ignore")
+    # _histtype is defined elsewhere; it returns the class name and the content array to store (presumably dtype-matched, confirm).
+    out._classname, content = _histtype(content)
+    # Stored values are laid out as [underflow, bin contents..., overflow]; a missing under/overflow row gives 0.
+    valuesarray = numpy.empty(len(content) + 2, dtype=content.dtype)
+    valuesarray[1:-1] = content
+    if len(underflowhist) == 0:
+        valuesarray[0] = 0
+    else:
+        valuesarray[0] = underflowhist["count"]
+    if len(overflowhist) == 0:
+        valuesarray[-1] = 0
+    else:
+        valuesarray[-1] = overflowhist["count"]
+    # The TH1 stand-in subclasses list, so the values become the object's own items.
+    out.extend(valuesarray)
+
+    return out
+
 def from_physt(histogram):
     import physt.binnings
     import physt.histogram1d

diff --git a/uproot_methods/convert.py b/uproot_methods/convert.py
@@ -40,6 +40,8 @@ def resolve(obj):
         def types(cls, obj):
             if cls is numpy.ndarray:
                 yield ("numpy", "ndarray", len(obj.shape), str(obj.dtype))
+            elif cls.__module__ == "pandas.core.frame" and cls.__name__ == "DataFrame":
+                yield ("pandas.core.frame", "DataFrame", obj.index.__class__.__name__, set(obj.columns))
             else:
                 yield (cls.__module__, cls.__name__)
             for x in cls.__bases__:
@@ -52,6 +54,9 @@ def types(cls, obj):
         elif isinstance(obj, tuple) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[0].__class__, obj[0])) and any(x[:2] == ("numpy", "ndarray") for x in types(obj[1].__class__, obj[1])) and len(obj[0]) + 1 == len(obj[1]):
             return ("uproot_methods.classes.TH1", "from_numpy", "uproot.write.objects.TH1", "TH1")
 
+        elif any(x[:3] == ("pandas.core.frame", "DataFrame", "IntervalIndex") and "count" in x[3] for x in types(obj.__class__, obj)):
+            return ("uproot_methods.classes.TH1", "from_pandas", "uproot.write.objects.TH1", "TH1")
+
         elif any(x == ("physt.histogram1d", "Histogram1D") for x in types(obj.__class__, obj)):
             return ("uproot_methods.classes.TH1", "from_physt", "uproot.write.objects.TH1", "TH1")