Skip to content

Can't groupby two or more keys #17

@natemcintosh

Description

@natemcintosh

When attempting to group by two or more keys, I get an `AttributeError`. This does not occur when grouping by a single key.

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-c4a4f7e033ab> in <module>
----> 1 df.groupby(['path','time']).alt.mean().compute()

~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
    154         dask.base.compute
    155         """
--> 156         (result,) = compute(self, traverse=False, **kwargs)
    157         return result
    158 

~/anaconda3/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    396     keys = [x.__dask_keys__() for x in collections]
    397     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398     results = schedule(dsk, keys, **kwargs)
    399     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    400 

~/anaconda3/lib/python3.6/site-packages/dask/threaded.py in get(dsk, result, cache, num_workers, pool, **kwargs)
     74     results = get_async(pool.apply_async, len(pool._pool), dsk, result,
     75                         cache=cache, get_id=_thread_get_id,
---> 76                         pack_exception=pack_exception, **kwargs)
     77 
     78     # Cleanup pools associated to dead threads

~/anaconda3/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    465                 finish_task(dsk, key, state, results, keyorder.get)
    466                 for f in posttask_cbs:
--> 467                     f(key, res, dsk, state, worker_id)
    468 
    469                 while state['ready'] and len(state['running']) < num_workers:

~/anaconda3/lib/python3.6/site-packages/dask/cache.py in _posttask(self, key, value, dsk, state, id)
     59             duration += max(self.durations.get(k, 0) for k in deps)
     60         self.durations[key] = duration
---> 61         nb = self._nbytes(value) + overhead + sys.getsizeof(key) * 4
     62         self.cache.put(key, value, cost=duration / nb / 1e9, nbytes=nb)
     63 

~/anaconda3/lib/python3.6/site-packages/cachey/nbytes.py in nbytes(o)
     27 
     28     if name == 'pandas.core.series.Series':
---> 29         return _array(o._data.blocks[0].values) + _array(o.index._data)
     30     elif name == 'pandas.core.frame.DataFrame':
     31         return _array(o.index) + sum([_array(blk.values)

~/anaconda3/lib/python3.6/site-packages/cachey/nbytes.py in _array(x)
      3 
      4 def _array(x):
----> 5     if x.dtype == 'O':
      6         return sys.getsizeof('0'*100) * x.size
      7     elif str(x.dtype) == 'category':

AttributeError: 'NoneType' object has no attribute 'dtype'

Is this a known problem, or is it a bug?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions