-
Notifications
You must be signed in to change notification settings - Fork 43
Bring threshold inside BaseReconstructor class #290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 14 commits
4907b2e
691a1c2
fdc0e1e
bb64353
bd10912
13f3e74
630640f
985f30f
84efe18
9efe813
0d3e628
eae49b8
815cb14
46fc46f
b95e7f8
2683d8c
dedaca3
6d516ca
ff6cc80
18f261c
46ac1bf
1d61eb5
a59359f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,40 +1,341 @@ | ||
| import networkx as nx | ||
| import numpy as np | ||
| import warnings | ||
| import scipy.sparse as sp | ||
| from scipy.sparse.csgraph import minimum_spanning_tree | ||
|
|
||
|
|
||
| class BaseReconstructor: | ||
| r"""Base class for graph reconstruction algorithms. | ||
| """Base class for graph reconstruction algorithms. | ||
|
|
||
| The basic usage of a graph reconstruction algorithm is as follows: | ||
|
|
||
| >>> reconstructor = ReconstructionAlgorithm() | ||
| >>> G = reconstructor.fit(TS, <some_params>) | ||
| >>> # or alternately, G = reconstructor.results['graph'] | ||
| >>> R = ReconstructionAlgorithm() | ||
| >>> G = R.fit(TS, <some_params>).to_graph() | ||
|
|
||
| However, this is probably not the desired behavior, because (depending on | ||
| the method used) it will return a complete graph with varying weights | ||
| between the edges. The network should typically be thresholded in some way | ||
| to produce a more useful structure. | ||
|
|
||
| >>> R = ReconstructionAlgorithm() | ||
| >>> R = R.fit(TS, <some_params>) | ||
| >>> R = R.remove_self_loops().threshold('quantile', quantile=0.8) | ||
| >>> G = R.to_graph() | ||
|
|
||
| Here, `TS` is an :math:`N \times L` numpy array consisting of :math:`L` | ||
| observations for each of :math:`N` sensors. This constrains the graphs | ||
| to have integer-valued nodes. | ||
| Note that these can all be combined into a single call using method | ||
| chaining. | ||
|
|
||
| The ``results`` dict object, in addition to containing the graph | ||
| object, may also contain objects created as a side effect of | ||
| reconstructing the network, which may be useful for debugging or | ||
| considering goodness of fit. What is returned will vary between | ||
| reconstruction algorithms. | ||
| All algorithms subclass BaseReconstructor and override the fit() method; | ||
| see the documentation for each subclass's fit() method for documentation of | ||
| the algorithm. | ||
|
|
||
| """ | ||
|
|
||
| def __init__(self): | ||
| """Constructor for reconstructor classes. These take no arguments and define | ||
| three attributes: | ||
|
|
||
| 1. `self._graph`: A representation of the reconstructed network as a | ||
| NetworkX graph (or subclass). | ||
|
|
||
| 2. `self._matrix`: A representation of the reconstructed network as a | ||
| (dense) NumPy array. | ||
|
|
||
| 3. `self.results`: A dictionary for storing any intermediate data | ||
| objects or diagnostics generated by a reconstruction method. | ||
|
|
||
| `self._graph` and `self._matrix` should not be accessed directly by | ||
| users; instead, use the `to_matrix()` and `to_graph()` methods. | ||
| """ | ||
|
|
||
| self._graph = None | ||
| self._matrix = None | ||
| self.results = {} | ||
|
|
||
| def fit(self, TS, **kwargs): | ||
| """Reconstruct a graph from time series TS. | ||
| def fit(self, TS): | ||
| """The key method of the class. This takes an NxL numpy array representing a | ||
| time series and reconstructs a network from it. | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
|
|
||
| Any new reconstruction method should subclass from BaseReconstructor | ||
| and override fit(). This method should reconstruct the network and | ||
| assign it to either self._graph or self._matrix, then return self. | ||
|
|
||
| """ | ||
| self._matrix = np.zeros((TS.shape[0], TS.shape[0])) | ||
| return self | ||
|
|
||
| def update_matrix(self, new_mat): | ||
| """ | ||
| Update the contents of `self._matrix`. This also empties out | ||
|
leotrs marked this conversation as resolved.
|
||
| `self._graph` to avoid inconsistent state between the graph and matrix | ||
| representations of the networks. | ||
| """ | ||
| self._matrix = new_mat.copy() | ||
| self._graph = None | ||
|
|
||
| def to_dense(self): | ||
| if self._matrix is None: | ||
| raise ValueError( | ||
| "Matrix representation is missing. Have you fit the data yet?" | ||
| ) | ||
| elif sp.issparse(self._matrix): | ||
| return self._matrix.toarray() | ||
| else: | ||
| return self._matrix | ||
|
|
||
| def to_sparse(self): | ||
| if self._matrix is None: | ||
| raise ValueError( | ||
| "Matrix and graph representations both missing. " | ||
| "Have you fit the data yet?" | ||
| ) | ||
| elif sp.issparse(self._matrix): | ||
| return self._matrix | ||
| else: | ||
| return sp.csr_matrix(self._matrix) | ||
|
|
||
| def to_matrix(self): | ||
| if self._matrix is not None: | ||
| return self._matrix | ||
| else: | ||
| raise ValueError( | ||
| "Matrix and graph representations both missing. " | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| "Have you fit the data yet?" | ||
| ) | ||
|
|
||
| def to_graph(self, create_using=None): | ||
| """Return the graph representation of the reconstructed network.""" | ||
| if self._graph is not None: | ||
| return self._graph | ||
| elif self._matrix is not None: | ||
| A = self._matrix.copy() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need a copy? |
||
|
|
||
| if not sp.issparse(self._matrix): | ||
| from_array = nx.from_numpy_array | ||
| else: | ||
| from_array = nx.from_scipy_sparse_matrix | ||
|
|
||
| if create_using is None: | ||
| try: | ||
| undirected = np.allclose(A, A.T) | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| except TypeError: | ||
| try: | ||
| undirected = _sparse_allclose(A) | ||
| except ValueError: | ||
| undirected = False | ||
|
|
||
| if undirected: | ||
| G = from_array(A, create_using=nx.Graph()) | ||
| else: | ||
| G = from_array(A, create_using=nx.DiGraph()) | ||
| else: | ||
| G = from_array(A, create_using=create_using) | ||
|
|
||
| self._graph = G | ||
| return self._graph | ||
| else: | ||
| raise ValueError( | ||
| "Matrix and graph representations both missing. " | ||
| "Have you fit the data yet?" | ||
| ) | ||
|
|
||
| def threshold_in_range(self, c=None, **kwargs): | ||
| """Threshold by setting values not within a list of ranges to zero. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| TS (np.ndarray): Array consisting of $L$ observations from $N$ sensors. | ||
| cutoffs (list of tuples) | ||
| When thresholding, include only edges whose correlations fall | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| within a given range or set of ranges. The lower value must come | ||
| first in each tuple. For example, to keep those values whose | ||
| absolute value is between :math:`0.5` and :math:`1`, pass | ||
| ``cutoffs=[(-1, -0.5), (0.5, 1)]``. | ||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if 'cutoffs' in kwargs and not c: | ||
| cutoffs = kwargs['cutoffs'] | ||
| elif not c: | ||
| warnings.warn( | ||
| "Setting 'cutoffs' argument is strongly encouraged. " | ||
| "Using cutoff range of (-1, 1).", | ||
| RuntimeWarning, | ||
| ) | ||
| cutoffs = [(-1, 1)] | ||
| else: | ||
| cutoffs = c | ||
|
|
||
| mat = s.to_dense().copy() | ||
| mask_function = np.vectorize( | ||
| lambda x: any([x >= cutoff[0] and x <= cutoff[1] for cutoff in cutoffs]) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use a generator instead of a list inside of
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I don't think there will be much of a difference there so feel free to ignore
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My preference would be to rewrite the range-based threshold to just take a min and a max, since you can call it twice now.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should do either of the things suggested (generator or rewrite), but we should do one of them before this is merged |
||
| ) | ||
| mask = mask_function(mat) | ||
|
|
||
| thresholded_mat = np.where(mask, mat, 0) | ||
| thresholded_mat = sp.csr_matrix(thresholded_mat) | ||
|
|
||
| Returns | ||
| ------- | ||
| G (nx.Graph): A reconstructed graph with $N$ nodes. | ||
| s.update_matrix(thresholded_mat) | ||
| return s | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps the docstring should mention that this returns a copy?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the copy logic is what I'm least confident about. My thought was to throw copies all over the place and then figure out which cases were unnecessary.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like it needs revisiting. Is this the design we are going for? |
||
|
|
||
| def threshold_on_quantile(self, q=None, **kwargs): | ||
| """Threshold by setting values below a given quantile to zero. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| quantile (float) | ||
| The threshold above which to keep an element of the array, e.g., | ||
| set to zero elements below the 90th quantile of the array. | ||
|
|
||
| """ | ||
| G = nx.Graph() # reconstruct the graph | ||
| self.results['graph'] = G # and store it in self.results | ||
| # self.results[..] = .. # also store other values if needed | ||
| return G | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if 'quantile' in kwargs and not q: | ||
| quantile = kwargs['quantile'] | ||
| elif not q: | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| warnings.warn( | ||
| "Setting 'quantile' argument is strongly recommended." | ||
| "Using target quantile of 0.9 for thresholding.", | ||
| RuntimeWarning, | ||
| ) | ||
| quantile = 0.9 | ||
| else: | ||
| quantile = q | ||
| mat = s.to_dense().copy() | ||
|
|
||
| if quantile != 0: | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| thresholded_mat = mat * (mat > np.percentile(mat, quantile * 100)) | ||
| else: | ||
| thresholded_mat = mat | ||
|
|
||
| s.update_matrix(sp.csr_matrix(thresholded_mat)) | ||
| return s | ||
|
|
||
| def threshold_on_degree(self, k=None, **kwargs): | ||
| """Threshold by iteratively setting the smallest entries in the weights | ||
| matrix to zero until the average degree falls below a given value. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| avg_k (float) | ||
| The average degree to target when thresholding the matrix. | ||
|
|
||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
|
Comment on lines
+219
to
+220
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two lines are repeated all over and they seem bug-prone as hell. Maybe refactor into a helper function?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is basically the code for SQLAlchemy's
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's time to decide whether we want to merge this as is or use |
||
| if 'avg_k' in kwargs and not k: | ||
| avg_k = kwargs['avg_k'] | ||
| elif not k: | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| warnings.warn( | ||
| "Setting 'avg_k' argument is strongly encouraged. Using average " | ||
| "degree of 1 for thresholding.", | ||
| RuntimeWarning, | ||
| ) | ||
| avg_k = 1 | ||
| else: | ||
| avg_k = k | ||
| mat = s._matrix.copy() | ||
|
|
||
| n = len(mat) | ||
| A = np.ones((n, n)) | ||
|
|
||
| if np.mean(np.sum(A, 1)) <= avg_k: | ||
| thresholded_mat = mat | ||
|
Comment on lines
+240
to
+239
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we computing the average row sum of an all-ones matrix?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. It's a degenerate case. |
||
| else: | ||
| for m in sorted(mat.flatten()): | ||
| A[mat == m] = 0 | ||
| if np.mean(np.sum(A, 1)) <= avg_k: | ||
| break | ||
| thresholded_mat = mat * (mat > m) | ||
|
|
||
| s.update_matrix(sp.csr_matrix(thresholded_mat)) | ||
| return s | ||
|
|
||
| def threshold(self, rule, **kwargs): | ||
| """A flexible interface to other thresholding functions. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| rule (str) | ||
| A string indicating which thresholding function to invoke. | ||
|
|
||
| kwargs (dict) | ||
| Named arguments to pass to the underlying threshold function. | ||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| try: | ||
| if rule == 'degree': | ||
| return s.threshold_on_degree(**kwargs) | ||
| elif rule == 'range': | ||
| return s.threshold_in_range(**kwargs) | ||
| elif rule == 'quantile': | ||
| return s.threshold_on_quantile(**kwargs) | ||
| elif rule == 'custom': | ||
| mat = s.to_dense() | ||
| s.update_matrix(sp.csr_matrix(kwargs['custom_thresholder'](mat))) | ||
| return s | ||
|
|
||
| except KeyError: | ||
| raise ValueError("missing threshold parameter") | ||
|
|
||
| def _mst_sparse(self, mat): | ||
| MST = minimum_spanning_tree(mat).asformat(mat.format) | ||
| return MST | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
|
|
||
| def _mst_dense(self, mat): | ||
| MST = minimum_spanning_tree(mat).asformat('csr') | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| return MST | ||
|
|
||
| def minimum_spanning_tree(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| MST = s._mst_sparse(s.to_dense()) | ||
| else: | ||
| MST = s._mst_dense(s.to_dense()) | ||
| s.update_matrix(MST) | ||
| return s | ||
|
|
||
| def _binarize_sparse(self, mat): | ||
| return np.abs(mat.sign()) | ||
|
|
||
| def _binarize_dense(self, mat): | ||
| return np.abs(np.sign(mat)) | ||
|
|
||
|
Comment on lines
+294
to
+299
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need both? |
||
| def binarize(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| mat = s._binarize_sparse(s._matrix) | ||
| else: | ||
| mat = s._binarize_dense(s._matrix) | ||
| s.update_matrix(mat) | ||
| return s | ||
|
|
||
| def _rsl_sparse(self, mat): | ||
| new_mat = mat.copy() | ||
| new_mat.setdiag(0) | ||
| return new_mat | ||
|
|
||
| def _rsl_dense(self, mat): | ||
| new_mat = mat.copy() | ||
| np.fill_diagonal(new_mat, 0) | ||
| return new_mat | ||
|
|
||
| def remove_self_loops(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| mat = s._rsl_sparse(s._matrix) | ||
| else: | ||
| mat = s._rsl_dense(s._matrix) | ||
| s.update_matrix(mat) | ||
| return s | ||
|
|
||
|
|
||
| def _sparse_allclose(mat, tol=1e-8): | ||
|
sdmccabe marked this conversation as resolved.
Outdated
|
||
| """ | ||
| np.allclose doesn't work on sparse matrices. This approximates its behavior. | ||
| """ | ||
| return abs((mat - mat.T) > tol).nnz == 0 | ||
Uh oh!
There was an error while loading. Please reload this page.