jnothman · jnothman · Jun 3, 2019 · May 30, 2019 · Jun 3, 2019 · Jun 3, 2019
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,20 @@
+What's new in version 0.3
+-------------------------
+
+- Added `from_contents` to provide an alternative, intuitive way of specifying
+ category membership of elements.
+- Fixed the display of the "intersection size" label on plots, which had been
+ missing.
+- To improve nomenclature, upsetplot now avoids using "set" to refer to the
+ top-level sets, which are now known as "categories". This matches the
+ intuition that categories are named, logical groupings, as opposed to
+ "subsets". To this end:
+
+ - `generate_data` now names its categories "cat1", "cat2" etc. rather than
+ "set1", "set2", etc.
+ - the `sort_sets_by` parameter has been renamed to `sort_categories_by`; the
+ old name is deprecated and will be removed in version 0.5.
+
 What's new in version 0.2.1
 ---------------------------
 

diff --git a/README.rst b/README.rst
@@ -12,15 +12,15 @@ more readable. Documentation is at https://upsetplot.readthedocs.io.
 This ``upsetplot`` library tries to provide a simple interface backed by an
 extensible, object-oriented design.
 
-The basic input format is a `pandas.Series` containing counts
-corresponding to set intersection sizes. The index indicates which rows
-pertain to which sets, by having multiple boolean indices, like ``example``
-in the following::
+The basic input format is a `pandas.Series` containing counts corresponding to
+subset sizes, where each subset is an intersection of named categories. The
+index of the Series indicates which rows pertain to which categories, by having
+multiple boolean indices, like ``example`` in the following::
 
  >>> from upsetplot import generate_data
  >>> example = generate_data(aggregated=True)
  >>> example # doctest: +NORMALIZE_WHITESPACE
- set0 set1 set2
+ cat0 cat1 cat2
  False False False 56
  True 283
  True False 1279
@@ -43,17 +43,17 @@ makes:
 .. image:: https://upsetplot.readthedocs.io/en/latest/_images/sphx_glr_plot_generated_001.png
  :target: ../auto_examples/plot_generated.html
 
-This plot shows the cardinality of every set combination seen in our data. The
-leftmost column counts items absent from any set. The next three columns count
-items only in ``set1``, ``set2`` and ``set3`` respectively, with following
-columns showing cardinalities for items in each combination of exactly two
-named sets. The rightmost column counts items in all three sets.
+This plot shows the cardinality of every category combination seen in our data.
+The leftmost column counts items absent from any category. The next three
+columns count items only in ``cat1``, ``cat2`` and ``cat3`` respectively, with
+following columns showing cardinalities for items in each combination of
+exactly two named categories. The rightmost column counts items in all three categories.
 
 Rotation
 ........
 
-We call the above plot style "horizontal" because the set intersections are
-presented from left to right. `Vertical plots
+We call the above plot style "horizontal" because the category intersections
+are presented from left to right. `Vertical plots
 <https://upsetplot.readthedocs.io/en/latest/auto_examples/plot_vertical.html>`__
 are also supported!
 
@@ -71,30 +71,29 @@ in each subset.
 .. image:: https://upsetplot.readthedocs.io/en/latest/_images/sphx_glr_plot_boston_001.png
  :target: https://upsetplot.readthedocs.io/en/latest/auto_examples/plot_boston.html
 
-
 Loading datasets
 ................
 
 While the dataset above is randomly generated, you can prepare your own dataset
 for input to upsetplot. A helpful tool is `from_memberships`, which allows
-us to reconstruct the example above by indicating each data point's set
+us to reconstruct the example above by indicating each data point's category
 membership::
 
  >>> from upsetplot import from_memberships
  >>> example = from_memberships(
  ... [[],
- ... ['set2'],
- ... ['set1'],
- ... ['set1', 'set2'],
- ... ['set0'],
- ... ['set0', 'set2'],
- ... ['set0', 'set1'],
- ... ['set0', 'set1', 'set2'],
+ ... ['cat2'],
+ ... ['cat1'],
+ ... ['cat1', 'cat2'],
+ ... ['cat0'],
+ ... ['cat0', 'cat2'],
+ ... ['cat0', 'cat1'],
+ ... ['cat0', 'cat1', 'cat2'],
  ... ],
  ... data=[56, 283, 1279, 5882, 24, 90, 429, 1957]
  ... )
  >>> example # doctest: +NORMALIZE_WHITESPACE
- set0 set1 set2
+ cat0 cat1 cat2
  False False False 56
  True 283
  True False 1279
@@ -105,6 +104,8 @@ membership::
  True 1957
  dtype: int64
 
+See also `from_contents`, another way to describe categorised data.
+
 Installation
 ------------
 
@@ -130,8 +131,8 @@ Why an alternative to py-upset?
 Probably for petty reasons. It appeared `py-upset
 <https://github.com/ImSoErgodic/py-upset>`_ was not being maintained. Its
 input format was undocumented, inefficient and, IMO, inappropriate. It did not
-facilitate showing plots of each set intersection distribution as in Lex et
-al's work introducing UpSet plots. Nor did it include the horizontal bar plots
+facilitate showing plots of each subset's distribution as in Lex et al's work
+introducing UpSet plots. Nor did it include the horizontal bar plots
 illustrated there. It did not support Python 2. I decided it would be easier to
 construct a cleaner version than to fix it.
 

diff --git a/upsetplot/data.py b/upsetplot/data.py
@@ -12,47 +12,47 @@ def generate_data(seed=0, n_samples=10000, n_sets=3, aggregated=False):
  df = pd.DataFrame({'value': np.zeros(n_samples)})
  for i in range(n_sets):
  r = rng.rand(n_samples)
- df['set%d' % i] = r > rng.rand()
+ df['cat%d' % i] = r > rng.rand()
  df['value'] += r
 
- df.set_index(['set%d' % i for i in range(n_sets)], inplace=True)
+ df.set_index(['cat%d' % i for i in range(n_sets)], inplace=True)
  if aggregated:
  return df.value.groupby(level=list(range(n_sets))).count()
  return df.value
 
 
 def from_memberships(memberships, data=None):
- """Load data where each sample has a collection of set names
+ """Load data where each sample has a collection of category names
 
  The output should be suitable for passing to `UpSet` or `plot`.
 
  Parameters
  ----------
  memberships : sequence of collections of strings
  Each element corresponds to a data point, indicating the sets it is a
- member of. Each set is named by a string.
+ member of. Each category is named by a string.
  data : Series-like or DataFrame-like, optional
- If given, the index of set memberships is attached to this data.
+ If given, the index of category memberships is attached to this data.
  It must have the same length as `memberships`.
  If not given, the series will contain the value 1.
 
  Returns
  -------
  DataFrame or Series
- `data` is returned with its index indicating set membership.
+ `data` is returned with its index indicating category membership.
  It will be a Series if `data` is a Series or 1d numeric array.
- The index will have levels ordered by set names.
+ The index will have levels ordered by category names.
 
  Examples
  --------
  >>> from upsetplot import from_memberships
  >>> from_memberships([
- ... ['set1', 'set3'],
- ... ['set2', 'set3'],
- ... ['set1'],
+ ... ['cat1', 'cat3'],
+ ... ['cat2', 'cat3'],
+ ... ['cat1'],
  ... []
  ... ]) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
- set1 set2 set3
+ cat1 cat2 cat3
  True False True 1
  False True True 1
  True False False 1
@@ -61,13 +61,13 @@ def from_memberships(memberships, data=None):
  >>> # now with data:
  >>> import numpy as np
  >>> from_memberships([
- ... ['set1', 'set3'],
- ... ['set2', 'set3'],
- ... ['set1'],
+ ... ['cat1', 'cat3'],
+ ... ['cat2', 'cat3'],
+ ... ['cat1'],
  ... []
  ... ], data=np.arange(12).reshape(4, 3)) # doctest: +NORMALIZE_WHITESPACE
  0 1 2
- set1 set2 set3
+ cat1 cat2 cat3
  True False True 0 1 2
  False True True 3 4 5
  True False False 6 7 8
@@ -77,9 +77,9 @@ def from_memberships(memberships, data=None):
  for names in memberships])
  for set_name in df.columns:
  if not hasattr(set_name, 'lower'):
- raise ValueError('Set names should be strings')
+ raise ValueError('Category names should be strings')
  if df.shape[1] == 0:
- raise ValueError('Require at least one set. None were found.')
+ raise ValueError('Require at least one category. None were found.')
  df.sort_index(axis=1, inplace=True)
  df.fillna(False, inplace=True)
  df = df.astype(bool)
@@ -118,7 +118,7 @@ def from_contents(contents, data=None, id_column='id'):
  Returns
  -------
  DataFrame
- `data` is returned with its index indicating set membership,
+ `data` is returned with its index indicating category membership,
  including a column named according to id_column.
  If data is not given, the order of rows is not assured.
 

diff --git a/upsetplot/plotting.py b/upsetplot/plotting.py
@@ -1,5 +1,6 @@
 from __future__ import print_function, division, absolute_import
 
+import warnings
 import itertools
 
 import numpy as np
@@ -9,7 +10,7 @@
 from matplotlib.tight_layout import get_renderer
 
 
-def _process_data(df, sort_by, sort_sets_by, sum_over):
+def _process_data(df, sort_by, sort_categories_by, sum_over):
  if df.ndim == 1:
  data = df
  df = pd.DataFrame({'_value': df})
@@ -40,10 +41,10 @@ def _process_data(df, sort_by, sort_sets_by, sum_over):
  totals = [data[data.index.get_level_values(name).values.astype(bool)].sum()
  for name in data.index.names]
  totals = pd.Series(totals, index=data.index.names)
- if sort_sets_by == 'cardinality':
+ if sort_categories_by == 'cardinality':
  totals.sort_values(ascending=False, inplace=True)
- elif sort_sets_by is not None:
- raise ValueError('Unknown sort_sets_by: %r' % sort_sets_by)
+ elif sort_categories_by is not None:
+ raise ValueError('Unknown sort_categories_by: %r' % sort_categories_by)
  df = df.reorder_levels(totals.index.values)
  data = data.reorder_levels(totals.index.values)
 
@@ -149,29 +150,32 @@ class UpSet:
  Parameters
  ----------
  data : pandas.Series or pandas.DataFrame
- Values for each set to plot.
- Should have multi-index where each level is binary,
- corresponding to set membership.
+ Elements associated with categories (a DataFrame), or the size of each
+ subset of categories (a Series).
+ Should have MultiIndex where each level is binary,
+ corresponding to category membership.
  If a DataFrame, `sum_over` must be a string or False.
  orientation : {'horizontal' (default), 'vertical'}
  If horizontal, intersections are listed from left to right.
  sort_by : {'cardinality', 'degree'}
- If 'cardinality', set intersections are listed from largest to
- smallest value.
- If 'degree', they are listed in order of the number of sets
+ If 'cardinality', subsets are listed from largest to smallest.
+ If 'degree', they are listed in order of the number of categories
  intersected.
- sort_sets_by : {'cardinality', None}
- Whether to sort the overall sets by total cardinality, or leave them
+ sort_categories_by : {'cardinality', None}
+ Whether to sort the categories by total cardinality, or leave them
  in the provided order.
+
+ .. versionadded:: 0.3
+ Replaces sort_sets_by
  sum_over : str, False or None (default)
  Must be specified when `data` is a DataFrame. If False, the
  intersection plot will show the count of each subset. Otherwise, it
  shows the sum of the specified field.
  facecolor : str
  Color for bar charts and dots.
  with_lines : bool
- Whether to show lines joining dots in the matrix, to mark multiple sets
- being intersected.
+ Whether to show lines joining dots in the matrix, to mark multiple
+ categories being intersected.
  element_size : float or None
  Side length in pt. If None, size is estimated to fit figure
  intersection_plot_elements : int
@@ -184,14 +188,19 @@ class UpSet:
  Whether to label the intersection size bars with the cardinality
  of the intersection. When a string, this formats the number.
  For example, '%d' is equivalent to True.
+ sort_sets_by
+ .. deprecated:: 0.3
+ Replaced by sort_categories_by, this parameter will be removed in
+ version 0.5.
  """
  _default_figsize = (10, 6)
 
  def __init__(self, data, orientation='horizontal', sort_by='degree',
- sort_sets_by='cardinality', sum_over=None, facecolor='black',
+ sort_categories_by='cardinality', sum_over=None,
+ facecolor='black',
  with_lines=True, element_size=32,
  intersection_plot_elements=6, totals_plot_elements=2,
- show_counts=''):
+ show_counts='', sort_sets_by='deprecated'):
 
  self._horizontal = orientation == 'horizontal'
  self._reorient = _identity if self._horizontal else _transpose
@@ -204,10 +213,15 @@ def __init__(self, data, orientation='horizontal', sort_by='degree',
  'elements': intersection_plot_elements}]
  self._show_counts = show_counts
 
+ if sort_sets_by != 'deprecated':
+ sort_categories_by = sort_sets_by
+ warnings.warn('sort_sets_by was deprecated in version 0.3 and '
+ 'will be removed in version 0.5', DeprecationWarning)
+
  (self._df, self.intersections,
  self.totals) = _process_data(data,
  sort_by=sort_by,
- sort_sets_by=sort_sets_by,
+ sort_categories_by=sort_categories_by,
  sum_over=sum_over)
  if not self._horizontal:
  self.intersections = self.intersections[::-1]
@@ -352,13 +366,13 @@ def plot_matrix(self, ax):
  """
  ax = self._reorient(ax)
  data = self.intersections
- n_sets = data.index.nlevels
+ n_cats = data.index.nlevels
 
  idx = np.flatnonzero(data.index.to_frame()[data.index.names].values)
- c = np.array(['lightgrey'] * len(data) * n_sets, dtype='O')
+ c = np.array(['lightgrey'] * len(data) * n_cats, dtype='O')
  c[idx] = self._facecolor
- x = np.repeat(np.arange(len(data)), n_sets)
- y = np.tile(np.arange(n_sets), len(data))
+ x = np.repeat(np.arange(len(data)), n_cats)
+ y = np.tile(np.arange(n_cats), len(data))
  if self._element_size is not None:
  s = (self._element_size * .35) ** 2
  else:
@@ -375,7 +389,7 @@ def plot_matrix(self, ax):
  lw=2, colors=self._facecolor)
 
  tick_axis = ax.yaxis
- tick_axis.set_ticks(np.arange(n_sets))
+ tick_axis.set_ticks(np.arange(n_cats))
  tick_axis.set_ticklabels(data.index.names,
  rotation=0 if self._horizontal else -90)
  ax.xaxis.set_visible(False)

diff --git a/upsetplot/tests/test_data.py b/upsetplot/tests/test_data.py
@@ -10,9 +10,9 @@
 
 @pytest.mark.parametrize('typ', [set, list, tuple, iter])
 def test_from_memberships_no_data(typ):
- with pytest.raises(ValueError, match='at least one set'):
+ with pytest.raises(ValueError, match='at least one category'):
  from_memberships([])
- with pytest.raises(ValueError, match='at least one set'):
+ with pytest.raises(ValueError, match='at least one category'):
  from_memberships([[], []])
  with pytest.raises(ValueError, match='strings'):
  from_memberships([[1]])