diff --git a/README.md b/README.md index f0681ba..7b304e8 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,11 @@ Lightweight implementations of generative label models for weakly supervised machine learning +### Installation +``` +pip3 install git+https://github.com/yongzx/labelmodels.git@top-k-probable +``` + # Example Usage - Naive Bayes Model ```python # Let votes be an m x n matrix where m is the number of data examples, n is the diff --git a/labelmodels/hmm.py b/labelmodels/hmm.py index b6b9597..eb0fd2f 100644 --- a/labelmodels/hmm.py +++ b/labelmodels/hmm.py @@ -129,7 +129,6 @@ def get_most_probable_labels(self, votes, seq_starts): seq_starts = np.array(seq_starts, dtype=np.int) out = np.ndarray((votes.shape[0],), dtype=np.int) - offset = 0 for votes, seq_starts in self._create_minibatches(votes, seq_starts, 32): jll = self._get_labeling_function_likelihoods(votes) @@ -166,6 +165,139 @@ def get_most_probable_labels(self, votes, seq_starts): offset += len(res) return out + def get_k_most_probable_labels(self, votes, seq_starts, topk, return_viterbi_scores=False): + """ + Computes the topk most probable underlying sequence nodes given function + outputs. + + Based on https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py + + :param votes: m x n matrix in {0, ..., k}, where m is the sum of + the lengths of the sequences in the batch, n is the + number of labeling functions and k is the number of + classes + :param seq_starts: vector of length l of row indices in votes indicating + the start of each sequence, where l is the number of + sequences in the batch. So, votes[seq_starts[i]] + is the row vector of labeling function outputs for the + first element in the ith sequence + :return: matrix of shape (topk, m), where element is the most likely predicted labels + """ + # Converts to CSR and integers to standardize input + votes = sparse.csr_matrix(votes, dtype=np.int) + seq_starts = np.array(seq_starts, dtype=np.int) + + out = np.ndarray((topk, votes.shape[0],), dtype=np.int32) + out_scores = np.ndarray((topk, seq_starts.shape[0],), dtype=np.float64) + final_scores = [] + + EMPTY = -1 + offset = 0 + offset_scores = 0 + for votes, seq_starts in self._create_minibatches( + votes, seq_starts, 32): + # Initializes joint log likelihood with labeling function likelihood + jll = self._get_labeling_function_likelihoods(votes) + norm_start_balance = self._get_norm_start_balance() + norm_transitions = self._get_norm_transitions() + + path_scores = [] + path_indices = [] + normalization = [] + T = votes.shape[0] + seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1] + + # follow https://github.com/stanfordnlp/stanza/blob/b24d124156911f95e3c5715e9dc9f75c6076619c/stanza/models/common/crf.py#L77 + # for implementation of normalization of viterbi scores + for i in range(0, T): + if i in seq_starts: + path_scores.append((jll[i] + norm_start_balance).unsqueeze(0)) + path_indices.append(torch.zeros([self.num_classes, self.num_classes])) + + alphas = (jll[i] + norm_start_balance).unsqueeze(0) # shape: (1, self.num_classes) + else: + p = path_scores[i-1].clone().unsqueeze(2) + norm_transitions + p = p.view(-1, self.num_classes) # shape: (self.num_classes, self.num_classes) + maxk = min(p.size()[0], topk) + scores, paths = torch.topk(p, k=maxk, dim=0) # paths would use (num_tags * n_permutations) nodes + + assert scores.shape == (maxk, self.num_classes) + assert paths.shape == (maxk, self.num_classes) + scores = jll[i] + scores + + path_scores.append(scores) + 
path_indices.append(paths) + + transition_scores = alphas.unsqueeze(2) + norm_transitions # shape: (1, self.num_classes, self.num_classes) + alphas = jll[i] + torch.logsumexp(transition_scores, dim=1) + + if i in seq_ends: + log_norm = torch.logsumexp(alphas, dim=1) + normalization.append(log_norm.item()) + + res = [] + res_scores = [] + seq_ends = [x - 1 for x in seq_starts] + [votes.shape[0] - 1] + for k in range(topk): + j = T-1 + viterbi_path = [] + viterbi_score = [] + while j >= 0: + if j in seq_ends: + seq_path_scores = path_scores[j].view(-1) + skip_rest = False + if seq_path_scores.shape[0] <= k: + # print("seq_end:", j) + skip_rest = True + + viterbi_scores, best_paths = torch.topk(seq_path_scores, k=min(topk, seq_path_scores.shape[0]), dim=0) # capped at 256 because some instances are 4-token long + if skip_rest: + viterbi_path.append(EMPTY) + viterbi_score.append(EMPTY) + else: + viterbi_path.append(best_paths[k]) + viterbi_score.append(viterbi_scores[k]) + # if k == 0: + # # because viterbi_scores include scores for other k, + # # this if-condition ensures that we only need to store the viterbi_scores for + # final_scores.append(viterbi_scores.tolist() + [-1] * (topk - seq_path_scores.shape[0])) # + if j in seq_starts: + j -= 1 + continue + if skip_rest: + viterbi_path.append(EMPTY) + else: + viterbi_path.append(int(path_indices[j].view(-1)[viterbi_path[-1]])) + j -= 1 + # # if path == -1, it means that at this k, there's no viterbi path. E.g., k = 257 and we are working with 4-token sentence + # # assert False + # if -1 in viterbi_path: + # assert False + viterbi_path = [(int(path % self.num_classes) + 1) if path != -1 else -1 for path in viterbi_path] + viterbi_path.reverse() + viterbi_score.reverse() + res.append(viterbi_path) + res_scores.append(viterbi_score) + + + for k in range(topk): + for i in range(len(res[k])): + out[k][offset + i] = res[k][i] + + for k in range(topk): + for i in range(len(res_scores[k])): + if res_scores[k][i] != EMPTY: + out_scores[k][offset_scores + i] = res_scores[k][i] - normalization[i] + else: + out_scores[k][offset_scores + i] = res_scores[k][i] + + offset += len(res[0]) + offset_scores += len(res_scores[0]) + + if return_viterbi_scores: + return out, out_scores + return out + def get_label_distribution(self, votes, seq_starts): """Returns the unary and pairwise marginals over true labels estimated by the model. 
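For readers of this patch, here is a minimal usage sketch of the new `HMM.get_k_most_probable_labels` (not part of the diff itself). The toy `votes`/`seq_starts` values are assumptions; the constructor arguments mirror the HMM test changed below, and the return shapes follow the method's docstring.

```python
import numpy as np
from labelmodels import HMM

# 5 tokens from 2 sequences (tokens 0-2 and 3-4), 2 labeling functions, votes in {0, 1, 2}
votes = np.array([[1, 2], [1, 1], [2, 2], [2, 1], [1, 1]])
seq_starts = [0, 3]

model = HMM(2, 2, init_acc=0.9, acc_prior=0.0)
model.estimate_label_model(votes, seq_starts)

# paths has shape (topk, m); scores has shape (topk, num_seqs) and holds
# log-probabilities normalized by each sequence's log partition function.
# Entries are -1 wherever a sequence admits fewer than topk distinct paths.
paths, scores = model.get_k_most_probable_labels(
    votes, seq_starts, topk=4, return_viterbi_scores=True)
print(paths.shape, scores.shape)  # (4, 5) (4, 2)
```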
diff --git a/labelmodels/linked_hmm.py b/labelmodels/linked_hmm.py index 40e4ce8..fe7c0da 100644 --- a/labelmodels/linked_hmm.py +++ b/labelmodels/linked_hmm.py @@ -1,3 +1,6 @@ +from cProfile import label +from contextlib import AsyncExitStack +from os import link from .label_model import ClassConditionalLabelModel, LearningConfig, init_random import numpy as np from scipy import sparse @@ -173,7 +176,7 @@ def get_link_propensities(self): prop = self.link_propensity.detach().numpy() return np.exp(prop) / (np.exp(prop) + 1) - def get_most_probable_labels(self, label_votes, link_votes, seq_starts): + def get_most_probable_labels(self, label_votes, link_votes, seq_starts, return_viterbi_scores=False): """ Computes the most probable underlying sequence nodes given function outputs @@ -198,6 +201,7 @@ def get_most_probable_labels(self, label_votes, link_votes, seq_starts): seq_starts = np.array(seq_starts, dtype=np.int) out = np.ndarray((label_votes.shape[0],), dtype=np.int) + final_scores = [] offset = 0 for label_votes, link_votes, seq_starts in self._create_minibatches( @@ -223,9 +227,11 @@ def get_most_probable_labels(self, label_votes, link_votes, seq_starts): seq_ends = [x - 1 for x in seq_starts] + [label_votes.shape[0] - 1] res = [] j = T-1 + _scores = list() while j >= 0: if j in seq_ends: res.append(torch.argmax(jll[j, :]).item()) + _scores.append(torch.max(jll[j, :]).item()) if j in seq_starts: j -= 1 continue @@ -233,10 +239,224 @@ def get_most_probable_labels(self, label_votes, link_votes, seq_starts): j -= 1 res = [x + 1 for x in res] res.reverse() + _scores.reverse() + final_scores += _scores for i in range(len(res)): out[offset + i] = res[i] offset += len(res) + if return_viterbi_scores: + return out, np.array(final_scores) + return out + + def compute_viterbi(self, label_votes, link_votes, seq_starts, return_viterbi_scores=False): + """ + Computes the most probable underlying sequence nodes given function + outputs + + :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of + the lengths of the sequences in the batch, n is the + number of labeling functions and k is the number of + classes + :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of + the lengths of the sequences in the batch and n is the + number of linking functions + :param seq_starts: vector of length l of row indices in votes indicating + the start of each sequence, where l is the number of + sequences in the batch. 
So, label_votes[seq_starts[i]] + is the row vector of labeling function outputs for the + first element in the ith sequence + :return: vector of length m, where element is the most likely predicted labels + """ + # Converts to CSR and integers to standardize input + label_votes = sparse.csr_matrix(label_votes, dtype=np.int) + link_votes = sparse.csr_matrix(link_votes, dtype=np.int) + seq_starts = np.array(seq_starts, dtype=np.int) + + out = np.ndarray((label_votes.shape[0],), dtype=np.int) + final = [] + + offset = 0 + for label_votes, link_votes, seq_starts in self._create_minibatches( + label_votes, link_votes, seq_starts, 32): + # Initializes joint log likelihood with labeling function likelihood + jll = self._get_labeling_function_likelihoods(label_votes) # (#tokens, #classes) + link_cll = self._get_linking_function_likelihoods(link_votes) # (#tokens, #classes, #classes) + norm_start_balance = self._get_norm_start_balance() # (#classes) + norm_transitions = self._get_norm_transitions() # (#classes, #classes) + + D = {} + + T = label_votes.shape[0] + # bt = torch.zeros([T, self.num_classes]) + for i in range(0, T): + new_D = dict() + if i in seq_starts: + # unary + start balance + for label in range(jll.shape[1]): + new_D[str(label)] = jll[i][label].item() + norm_start_balance[label].item() + if D: + final.append(D) + D = {} + else: + # previous score + transition + linking + unary + for prev_seq, score in D.items(): + prev_label = int(prev_seq[-1]) + for label in range(jll.shape[1]): + new_score = score + norm_transitions[prev_label][label] + link_cll[i][prev_label][label] + jll[i][label] + new_D[f"{prev_seq}{label}"] = new_score.item() + D = new_D + if D: + final.append(D) + return final + + + def get_link_propensities(self): + """Returns the model's estimated linking function propensities, i.e., + the probability that a linking function does not abstain + :return: a NumPy array with one element in [0,1] for each linking + function, representing the estimated probability that + the corresponding linking function does not abstain + """ + prop = self.link_propensity.detach().numpy() + return np.exp(prop) / (np.exp(prop) + 1) + + def get_k_most_probable_labels(self, label_votes, link_votes, seq_starts, topk, return_viterbi_scores=False): + """ + Computes the topk most probable underlying sequence nodes given function + outputs. + + Based on https://github.com/allenai/allennlp/blob/master/allennlp/nn/util.py + + :param label_votes: m x n matrix in {0, ..., k}, where m is the sum of + the lengths of the sequences in the batch, n is the + number of labeling functions and k is the number of + classes + :param link_votes: m x n matrix in {-1, 0, 1}, where m is the sum of + the lengths of the sequences in the batch and n is the + number of linking functions + :param seq_starts: vector of length l of row indices in votes indicating + the start of each sequence, where l is the number of + sequences in the batch. 
So, label_votes[seq_starts[i]] + is the row vector of labeling function outputs for the + first element in the ith sequence + :return: matrix of shape (topk, m), where element is the most likely predicted labels + """ + # Converts to CSR and integers to standardize input + label_votes = sparse.csr_matrix(label_votes, dtype=np.int32) + link_votes = sparse.csr_matrix(link_votes, dtype=np.int) + seq_starts = np.array(seq_starts, dtype=np.int) + + out = np.ndarray((topk, label_votes.shape[0],), dtype=np.int32) + out_scores = np.ndarray((topk, seq_starts.shape[0],), dtype=np.float64) + final_scores = [] + + offset = 0 + offset_scores = 0 + for label_votes, link_votes, seq_starts in self._create_minibatches( + label_votes, link_votes, seq_starts, 32): + # Initializes joint log likelihood with labeling function likelihood + jll = self._get_labeling_function_likelihoods(label_votes) + link_cll = self._get_linking_function_likelihoods(link_votes) + norm_start_balance = self._get_norm_start_balance() + norm_transitions = self._get_norm_transitions() + + path_scores = [] + path_indices = [] + T = label_votes.shape[0] + normalization = [] + seq_ends = [x - 1 for x in seq_starts] + [label_votes.shape[0] - 1] + + for i in range(0, T): + if i in seq_starts: + path_scores.append((jll[i] + norm_start_balance).unsqueeze(0)) + path_indices.append(torch.zeros([self.num_classes, self.num_classes])) + + alphas = (jll[i] + norm_start_balance).unsqueeze(0) # shape: (1, self.num_classes) + else: + p = path_scores[i-1].clone().unsqueeze(2) + norm_transitions + p += link_cll[i] + p = p.view(-1, self.num_classes) # shape: (self.num_classes, self.num_classes) + + maxk = min(p.size()[0], topk) + scores, paths = torch.topk(p, k=maxk, dim=0) # paths would use (num_tags * n_permutations) nodes + + assert scores.shape == (maxk, self.num_classes) + assert paths.shape == (maxk, self.num_classes) + scores = jll[i] + scores + + path_scores.append(scores) + path_indices.append(paths) + + transition_scores = alphas.unsqueeze(2) + norm_transitions + link_cll[i] # shape: (1, self.num_classes, self.num_classes) + alphas = jll[i] + torch.logsumexp(transition_scores, dim=1) # shape: (1, self.num_classes) + + if i in seq_ends: + log_norm = torch.logsumexp(alphas, dim=1) + normalization.append(log_norm.item()) + + res = [] + res_scores = [] + seq_ends = [x - 1 for x in seq_starts] + [label_votes.shape[0] - 1] + for k in range(topk): + j = T-1 + viterbi_path = [] + viterbi_score = [] + while j >= 0: + if j in seq_ends: + seq_path_scores = path_scores[j].view(-1) + skip_rest = False + if seq_path_scores.shape[0] <= k: + # print("seq_end:", j) + skip_rest = True + + viterbi_scores, best_paths = torch.topk(seq_path_scores, k=min(topk, seq_path_scores.shape[0]), dim=0) # capped at 256 because some instances are 4-token long + if skip_rest: + viterbi_path.append(-1) + viterbi_score.append(-1) + else: + viterbi_path.append(best_paths[k]) + viterbi_score.append(viterbi_scores[k]) + # if k == 0: + # # because viterbi_scores include scores for other k, + # # this if-condition ensures that we only need to store the viterbi_scores for + # final_scores.append(viterbi_scores.tolist() + [-1] * (topk - seq_path_scores.shape[0])) # + if j in seq_starts: + j -= 1 + continue + if skip_rest: + viterbi_path.append(-1) + else: + viterbi_path.append(int(path_indices[j].view(-1)[viterbi_path[-1]])) + j -= 1 + # # if path == -1, it means that at this k, there's no viterbi path. 
E.g., k = 257 and we are working with 4-token sentence + # # assert False + # if -1 in viterbi_path: + # assert False + viterbi_path = [(int(path % self.num_classes) + 1) if path != -1 else -1 for path in viterbi_path] + viterbi_path.reverse() + viterbi_score.reverse() + res.append(viterbi_path) + res_scores.append(viterbi_score) + + + for k in range(topk): + for i in range(len(res[k])): + out[k][offset + i] = res[k][i] + + for k in range(topk): + for i in range(len(res_scores[k])): + if res_scores[k][i] != -1: + out_scores[k][offset_scores + i] = res_scores[k][i] - normalization[i] + # out_scores[k][offset_scores + i] = res_scores[k][i] + else: + out_scores[k][offset_scores + i] = res_scores[k][i] + + offset += len(res[0]) + offset_scores += len(res_scores[0]) + + if return_viterbi_scores: + return out, out_scores return out def get_label_distribution(self, label_votes, link_votes, seq_starts): @@ -331,7 +551,7 @@ def get_label_distribution(self, label_votes, link_votes, seq_starts): p_pairwise[i] -= denom out_pairwise[offset + i, :, :] = torch.exp(p_pairwise[i]).detach() - + offset += label_votes.shape[0] return out_unary, out_pairwise diff --git a/test/test_hmm.py b/test/test_hmm.py index bcba655..0c607bb 100644 --- a/test/test_hmm.py +++ b/test/test_hmm.py @@ -1,3 +1,7 @@ +import sys +mypath = "/Users/zhengxinyong/Desktop/labelmodels" +sys.path.append(mypath) + from labelmodels import HMM import numpy as np from scipy import sparse @@ -13,163 +17,207 @@ def setUp(self): def tearDown(self): pass - def test_estimate_label_model_binary(self): - n = 5 + # def test_estimate_label_model_binary(self): + # n = 5 + # k = 2 + + # accuracies = np.array([[.9, .8], + # [.6, .7], + # [.6, .6], + # [.7, .6], + # [.8, .8]]) + # propensities = np.array([.9] * n) + # start_balance = np.array([.3, .7]) + # transitions = np.array([[.5, .5], [.3, .7]]) + + # labels_train, seq_starts_train, gold_train = _generate_data( + # 1000, 8, 12, n, accuracies, propensities, start_balance, transitions + # ) + + # model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) + # model.estimate_label_model(labels_train, seq_starts_train) + + # for i in range(n): + # for j in range(k): + # diff = accuracies[i, j] - model.get_accuracies()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n): + # diff = propensities[i] - model.get_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # diff = start_balance[i] - model.get_start_balance()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # for j in range(k): + # diff = transitions[i, j] - model.get_transition_matrix()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + + # def test_estimate_label_model_multiclass(self): + # n = 5 + # k = 3 + + # accuracies = np.array([[.9, .8, .9], + # [.6, .7, .9], + # [.6, .6, .9], + # [.7, .6, .9], + # [.8, .8, .9]]) + # propensities = np.array([.9] * n) + # start_balance = np.array([.3, .3, .4]) + # transitions = np.array([[.5, .3, .2], + # [.3, .4, .3], + # [.2, .5, .3]]) + + # labels_train, seq_starts_train, gold_train = _generate_data( + # 1000, 8, 12, n, accuracies, propensities, start_balance, transitions + # ) + + # model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) + # model.estimate_label_model(labels_train, seq_starts_train) + + # for i in range(n): + # for j in range(k): + # diff = accuracies[i, j] - model.get_accuracies()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n): + # diff = propensities[i] - 
model.get_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # diff = start_balance[i] - model.get_start_balance()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # for j in range(k): + # diff = transitions[i, j] - model.get_transition_matrix()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + + # def test_get_most_probable_labels(self): + # m = 500 + # n = 10 + # k = 3 + + # model = HMM(k, n, acc_prior=0.0) + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n): + # model.propensity[i] = 2 + # for j in range(k): + # model.accuracy[i, j] = 2 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels_train, seq_starts_train, gold_train = _generate_data( + # m, 8, 12, n, + # model.get_accuracies(), + # model.get_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # predictions = model.get_most_probable_labels(labels_train, seq_starts_train) + # correct = 0 + # for i in range(len(predictions)): + # if predictions[i] == gold_train[i]: + # correct += 1 + # accuracy = correct / float(len(predictions)) + # self.assertGreaterEqual(accuracy, .95) + + def test_get_k_most_probable_labels(self): + n = 3 k = 2 - accuracies = np.array([[.9, .8], - [.6, .7], - [.6, .6], - [.7, .6], - [.8, .8]]) - propensities = np.array([.9] * n) - start_balance = np.array([.3, .7]) - transitions = np.array([[.5, .5], [.3, .7]]) - - labels_train, seq_starts_train, gold_train = _generate_data( - 1000, 8, 12, n, accuracies, propensities, start_balance, transitions - ) - - model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) - model.estimate_label_model(labels_train, seq_starts_train) - - for i in range(n): - for j in range(k): - diff = accuracies[i, j] - model.get_accuracies()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n): - diff = propensities[i] - model.get_propensities()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - diff = start_balance[i] - model.get_start_balance()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - for j in range(k): - diff = transitions[i, j] - model.get_transition_matrix()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - - def test_estimate_label_model_multiclass(self): - n = 5 - k = 3 - - accuracies = np.array([[.9, .8, .9], - [.6, .7, .9], - [.6, .6, .9], - [.7, .6, .9], - [.8, .8, .9]]) - propensities = np.array([.9] * n) - start_balance = np.array([.3, .3, .4]) - transitions = np.array([[.5, .3, .2], - [.3, .4, .3], - [.2, .5, .3]]) - - labels_train, seq_starts_train, gold_train = _generate_data( - 1000, 8, 12, n, accuracies, propensities, start_balance, transitions - ) - - model = HMM(k, n, acc_prior=0.0, balance_prior=0.0) + model = HMM(k, n, init_acc=0.9, acc_prior=0.0) + labels_train = [[2, 0, 2], [1, 2, 2], [1, 0, 1], [1, 0, 1], [1, 0, 1], [1, 0, 1], [1, 0, 2], [1, 0, 1], [1, 0, 1], [1, 0, 2]] + seq_starts_train = [0, 2, 5, 8] model.estimate_label_model(labels_train, seq_starts_train) + print(model.get_accuracies()) - for i in range(n): - for j in range(k): - diff = accuracies[i, j] - model.get_accuracies()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n): - diff = propensities[i] - model.get_propensities()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - diff = start_balance[i] - model.get_start_balance()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for 
i in range(k): - for j in range(k): - diff = transitions[i, j] - model.get_transition_matrix()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - - def test_get_most_probable_labels(self): - m = 500 - n = 10 - k = 3 - - model = HMM(k, n, acc_prior=0.0) - with torch.no_grad(): - model.start_balance[0] = 0 - model.start_balance[1] = 0.5 - for i in range(n): - model.propensity[i] = 2 - for j in range(k): - model.accuracy[i, j] = 2 - for i in range(k): - for j in range(k): - model.transitions[i, j] = 1 if i == j else 0 - - labels_train, seq_starts_train, gold_train = _generate_data( - m, 8, 12, n, - model.get_accuracies(), - model.get_propensities(), - model.get_start_balance(), - model.get_transition_matrix()) + predictions, scores = model.get_k_most_probable_labels(labels_train, seq_starts_train, 8, True) + print(predictions) + print(scores) + print(np.sum(np.exp(np.ma.masked_values(scores, -1)), 0)) predictions = model.get_most_probable_labels(labels_train, seq_starts_train) - correct = 0 - for i in range(len(predictions)): - if predictions[i] == gold_train[i]: - correct += 1 - accuracy = correct / float(len(predictions)) - self.assertGreaterEqual(accuracy, .95) - - def test_get_label_distribution(self): - m = 500 - n = 10 - k = 3 - - model = HMM(k, n, acc_prior=0.0) - with torch.no_grad(): - model.start_balance[0] = 0 - model.start_balance[1] = 0.5 - for i in range(n): - model.propensity[i] = 2 - for j in range(k): - model.accuracy[i, j] = 2 - for i in range(k): - for j in range(k): - model.transitions[i, j] = 1 if i == j else 0 - - labels_train, seq_starts_train, gold_train = _generate_data( - m, 8, 12, n, - model.get_accuracies(), - model.get_propensities(), - model.get_start_balance(), - model.get_transition_matrix()) - - p_unary, p_pairwise = model.get_label_distribution( - labels_train, seq_starts_train) - - # Makes predictions using both unary and pairwise marginals - pred_unary = np.argmax(p_unary, axis=1) + 1 - pred_pairwise = np.zeros((labels_train.shape[0],), dtype=np.int) - next_seq = 0 - for i in range(labels_train.shape[0] - 1): - if next_seq == len(seq_starts_train) or i < seq_starts_train[next_seq] - 1: - # i is neither the start nor end of a sequence - pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) - elif i == seq_starts_train[next_seq]: - # i is the start of a sequence - a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) - pred_pairwise[i], pred_pairwise[i + 1] = a, b - next_seq += 1 - else: - # i is the end of a sequence - pass - pred_pairwise += 1 - - # Checks that predictions are accurate - for predictions in (pred_unary, pred_pairwise): - correct = 0 - for i in range(len(predictions)): - if predictions[i] == gold_train[i]: - correct += 1 - accuracy = correct / float(len(predictions)) - self.assertGreaterEqual(accuracy, .95) + print(predictions) + assert False + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n): + # model.propensity[i] = 2 + # for j in range(k): + # model.accuracy[i, j] = 2 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels_train, seq_starts_train, gold_train = _generate_data( + # m, 8, 12, n, + # model.get_accuracies(), + # model.get_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # predictions = model.get_most_probable_labels(labels_train, seq_starts_train) + # correct = 0 + # for i in range(len(predictions)): + # if predictions[i] == gold_train[i]: + # correct += 1 + # 
accuracy = correct / float(len(predictions)) + # self.assertGreaterEqual(accuracy, .95) + + # def test_get_label_distribution(self): + # m = 500 + # n = 10 + # k = 3 + + # model = HMM(k, n, acc_prior=0.0) + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n): + # model.propensity[i] = 2 + # for j in range(k): + # model.accuracy[i, j] = 2 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels_train, seq_starts_train, gold_train = _generate_data( + # m, 8, 12, n, + # model.get_accuracies(), + # model.get_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # p_unary, p_pairwise = model.get_label_distribution( + # labels_train, seq_starts_train) + + # # Makes predictions using both unary and pairwise marginals + # pred_unary = np.argmax(p_unary, axis=1) + 1 + # pred_pairwise = np.zeros((labels_train.shape[0],), dtype=np.int) + # next_seq = 0 + # for i in range(labels_train.shape[0] - 1): + # if next_seq == len(seq_starts_train) or i < seq_starts_train[next_seq] - 1: + # # i is neither the start nor end of a sequence + # pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) + # elif i == seq_starts_train[next_seq]: + # # i is the start of a sequence + # a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) + # pred_pairwise[i], pred_pairwise[i + 1] = a, b + # next_seq += 1 + # else: + # # i is the end of a sequence + # pass + # pred_pairwise += 1 + + # # Checks that predictions are accurate + # for predictions in (pred_unary, pred_pairwise): + # correct = 0 + # for i in range(len(predictions)): + # if predictions[i] == gold_train[i]: + # correct += 1 + # accuracy = correct / float(len(predictions)) + # self.assertGreaterEqual(accuracy, .95) def _generate_data(num_seqs, min_seq, max_seq, num_lfs, accuracies, diff --git a/test/test_linked_hmm.py b/test/test_linked_hmm.py index fec357a..77469a7 100644 --- a/test/test_linked_hmm.py +++ b/test/test_linked_hmm.py @@ -1,10 +1,13 @@ +import sys +mypath = "/Users/zhengxinyong/Desktop/labelmodels" +sys.path.append(mypath) + from labelmodels import LinkedHMM, LearningConfig import numpy as np from scipy import sparse import torch import unittest - class TestLinkedHMM(unittest.TestCase): def setUp(self): @@ -13,218 +16,492 @@ def setUp(self): def tearDown(self): pass - def test_estimate_label_model_binary(self): - n1 = 5 - n2 = 3 - k = 2 - - label_accuracies = np.array([[.9, .8], - [.6, .7], - [.6, .6], - [.7, .6], - [.8, .8]]) - link_accuracies = np.array([.8, .6, .8]) - label_propensities = np.array([.9] * n1) - link_propensities = np.array([.9] * n1) - start_balance = np.array([.3, .7]) - transitions = np.array([[.5, .5], [.3, .7]]) - - labels, links, seq_starts, gold = _generate_data( - 1000, 8, 12, n1, n2, - label_accuracies, - link_accuracies, - label_propensities, - link_propensities, - start_balance, - transitions - ) - - model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) - config = LearningConfig() - config.epochs = 3 - model.estimate_label_model(labels, links, seq_starts, config=config) - - for i in range(n1): - for j in range(k): - diff = label_accuracies[i, j] - model.get_accuracies()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n2): - for j in range(k): - diff = link_accuracies[i] - model.get_link_accuracies()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n1): - diff = label_propensities[i] - model.get_propensities()[i] - 
self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n2): - diff = link_propensities[i] - model.get_link_propensities()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - diff = start_balance[i] - model.get_start_balance()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - for j in range(k): - diff = transitions[i, j] - model.get_transition_matrix()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - - def test_estimate_label_model_multiclass(self): - n1 = 5 - n2 = 3 - k = 3 - - label_accuracies = np.array([[.9, .8, .5], - [.6, .7, .3], - [.6, .6, .8], - [.7, .6, .6], - [.8, .8, .9]]) - link_accuracies = np.array([.8, .6, .8]) - label_propensities = np.array([.9] * n1) - link_propensities = np.array([.9] * n1) - start_balance = np.array([.3, .3, .4]) - transitions = np.array([[.5, .3, .2], - [.4, .3, .3], - [.3, .3, .4]]) - - labels, links, seq_starts, gold = _generate_data( - 1000, 8, 12, n1, n2, - label_accuracies, - link_accuracies, - label_propensities, - link_propensities, - start_balance, - transitions - ) - - model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) - config = LearningConfig() - config.epochs = 4 - model.estimate_label_model(labels, links, seq_starts, config=config) - - for i in range(n1): - for j in range(k): - diff = label_accuracies[i, j] - model.get_accuracies()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n2): - for j in range(k): - diff = link_accuracies[i] - model.get_link_accuracies()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n1): - diff = label_propensities[i] - model.get_propensities()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(n2): - diff = link_propensities[i] - model.get_link_propensities()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - diff = start_balance[i] - model.get_start_balance()[i] - self.assertAlmostEqual(diff, 0.0, places=1) - for i in range(k): - for j in range(k): - diff = transitions[i, j] - model.get_transition_matrix()[i, j] - self.assertAlmostEqual(diff, 0.0, places=1) - - def test_get_most_probable_labels(self): - m = 500 - n1 = 3 - n2 = 5 - k = 3 - - model = LinkedHMM(k, n1, n2) - with torch.no_grad(): - model.start_balance[0] = 0 - model.start_balance[1] = 0.5 - for i in range(n1): - model.propensity[i] = 0 - for j in range(k): - model.accuracy[i, j] = 1 - for i in range(n2): - model.link_propensity[i] = 0 - model.link_accuracy[i] = 1.5 - for i in range(k): - for j in range(k): - model.transitions[i, j] = 1 if i == j else 0 - - labels, links, seq_starts, gold = _generate_data( - m, 8, 12, n1, n2, - model.get_label_accuracies(), - model.get_link_accuracies(), - model.get_label_propensities(), - model.get_link_propensities(), - model.get_start_balance(), - model.get_transition_matrix()) - - predictions = model.get_most_probable_labels(labels, links, seq_starts) - correct = 0 - for i in range(len(predictions)): - if predictions[i] == gold[i]: - correct += 1 - accuracy = correct / float(len(predictions)) - self.assertGreaterEqual(accuracy, .95) - - def test_get_label_distribution(self): - m = 500 - n1 = 3 - n2 = 5 - k = 3 - - model = LinkedHMM(k, n1, n2) - with torch.no_grad(): - model.start_balance[0] = 0 - model.start_balance[1] = 0.5 - for i in range(n1): - model.propensity[i] = 0 - for j in range(k): - model.accuracy[i, j] = 1 - for i in range(n2): - model.link_propensity[i] = 0 - model.link_accuracy[i] = 1.5 - for i in range(k): - for j in range(k): - model.transitions[i, j] = 1 if 
i == j else 0 - - labels, links, seq_starts, gold = _generate_data( - m, 8, 12, n1, n2, - model.get_label_accuracies(), - model.get_link_accuracies(), - model.get_label_propensities(), - model.get_link_propensities(), - model.get_start_balance(), - model.get_transition_matrix()) - - p_unary, p_pairwise = model.get_label_distribution( - labels, links, seq_starts) - - # Makes predictions using both unary and pairwise marginals - pred_unary = np.argmax(p_unary, axis=1) + 1 - pred_pairwise = np.zeros((labels.shape[0],), dtype=np.int) - next_seq = 0 - for i in range(labels.shape[0] - 1): - if next_seq == len(seq_starts) or i < seq_starts[next_seq] - 1: - # i is neither the start nor end of a sequence - pred_pairwise[i+1] = np.argmax(p_pairwise[i][pred_pairwise[i]]) - elif i == seq_starts[next_seq]: - # i is the start of a sequence - a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) - pred_pairwise[i], pred_pairwise[i + 1] = a, b - next_seq += 1 - else: - # i is the end of a sequence - pass - pred_pairwise += 1 - - # Checks that predictions are accurate - for predictions in (pred_unary, pred_pairwise): - correct = 0 - for i in range(len(predictions)): - if predictions[i] == gold[i]: - correct += 1 - accuracy = correct / float(len(predictions)) - self.assertGreaterEqual(accuracy, .95) + # def test_estimate_label_model_binary(self): + # n1 = 5 + # n2 = 3 + # k = 2 + + # label_accuracies = np.array([[.9, .8], + # [.6, .7], + # [.6, .6], + # [.7, .6], + # [.8, .8]]) + # link_accuracies = np.array([.8, .6, .8]) + # label_propensities = np.array([.9] * n1) + # link_propensities = np.array([.9] * n1) + # start_balance = np.array([.3, .7]) + # transitions = np.array([[.5, .5], [.3, .7]]) + + # labels, links, seq_starts, gold = _generate_data( + # 1000, 8, 12, n1, n2, + # label_accuracies, + # link_accuracies, + # label_propensities, + # link_propensities, + # start_balance, + # transitions + # ) + + # model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) + # config = LearningConfig() + # config.epochs = 3 + # model.estimate_label_model(labels, links, seq_starts, config=config) + + # for i in range(n1): + # for j in range(k): + # diff = label_accuracies[i, j] - model.get_accuracies()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n2): + # for j in range(k): + # diff = link_accuracies[i] - model.get_link_accuracies()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n1): + # diff = label_propensities[i] - model.get_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n2): + # diff = link_propensities[i] - model.get_link_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # diff = start_balance[i] - model.get_start_balance()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # for j in range(k): + # diff = transitions[i, j] - model.get_transition_matrix()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + + # def test_estimate_label_model_multiclass(self): + # n1 = 5 + # n2 = 3 + # k = 3 + + # label_accuracies = np.array([[.9, .8, .5], + # [.6, .7, .3], + # [.6, .6, .8], + # [.7, .6, .6], + # [.8, .8, .9]]) + # link_accuracies = np.array([.8, .6, .8]) + # label_propensities = np.array([.9] * n1) + # link_propensities = np.array([.9] * n1) + # start_balance = np.array([.3, .3, .4]) + # transitions = np.array([[.5, .3, .2], + # [.4, .3, .3], + # [.3, .3, .4]]) + + # labels, links, seq_starts, gold = _generate_data( + # 1000, 8, 12, n1, n2, + # 
label_accuracies, + # link_accuracies, + # label_propensities, + # link_propensities, + # start_balance, + # transitions + # ) + + # model = LinkedHMM(k, n1, n2, acc_prior=0.0, balance_prior=0.0) + # config = LearningConfig() + # config.epochs = 4 + # model.estimate_label_model(labels, links, seq_starts, config=config) + + # for i in range(n1): + # for j in range(k): + # diff = label_accuracies[i, j] - model.get_accuracies()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n2): + # for j in range(k): + # diff = link_accuracies[i] - model.get_link_accuracies()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n1): + # diff = label_propensities[i] - model.get_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(n2): + # diff = link_propensities[i] - model.get_link_propensities()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # diff = start_balance[i] - model.get_start_balance()[i] + # self.assertAlmostEqual(diff, 0.0, places=1) + # for i in range(k): + # for j in range(k): + # diff = transitions[i, j] - model.get_transition_matrix()[i, j] + # self.assertAlmostEqual(diff, 0.0, places=1) + + # def test_get_most_probable_labels(self): + # m = 500 + # n1 = 3 + # n2 = 5 + # k = 3 + + # model = LinkedHMM(k, n1, n2) + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n1): + # model.propensity[i] = 0 + # for j in range(k): + # model.accuracy[i, j] = 1 + # for i in range(n2): + # model.link_propensity[i] = 0 + # model.link_accuracy[i] = 1.5 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels, links, seq_starts, gold = _generate_data( + # m, 8, 12, n1, n2, + # model.get_label_accuracies(), + # model.get_link_accuracies(), + # model.get_label_propensities(), + # model.get_link_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # predictions = model.get_most_probable_labels(labels, links, seq_starts) + # correct = 0 + # for i in range(len(predictions)): + # if predictions[i] == gold[i]: + # correct += 1 + # accuracy = correct / float(len(predictions)) + # self.assertGreaterEqual(accuracy, .95) + + # def test_get_label_distribution(self): + # m = 500 + # n1 = 3 + # n2 = 5 + # k = 3 + + # model = LinkedHMM(k, n1, n2) + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n1): + # model.propensity[i] = 0 + # for j in range(k): + # model.accuracy[i, j] = 1 + # for i in range(n2): + # model.link_propensity[i] = 0 + # model.link_accuracy[i] = 1.5 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels, links, seq_starts, gold = _generate_data( + # m, 8, 12, n1, n2, + # model.get_label_accuracies(), + # model.get_link_accuracies(), + # model.get_label_propensities(), + # model.get_link_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # p_unary, p_pairwise = model.get_label_distribution( + # labels, links, seq_starts) + + # # Makes predictions using both unary and pairwise marginals + # pred_unary = np.argmax(p_unary, axis=1) + 1 + # pred_pairwise = np.zeros((labels.shape[0],), dtype=np.int) + # next_seq = 0 + # for i in range(labels.shape[0] - 1): + # if next_seq == len(seq_starts) or i < seq_starts[next_seq] - 1: + # # i is neither the start nor end of a sequence + # pred_pairwise[i+1] = 
np.argmax(p_pairwise[i][pred_pairwise[i]]) + # elif i == seq_starts[next_seq]: + # # i is the start of a sequence + # a, b = np.unravel_index(p_pairwise[i].argmax(), (k, k)) + # pred_pairwise[i], pred_pairwise[i + 1] = a, b + # next_seq += 1 + # else: + # # i is the end of a sequence + # pass + # pred_pairwise += 1 + + # # Checks that predictions are accurate + # for predictions in (pred_unary, pred_pairwise): + # correct = 0 + # for i in range(len(predictions)): + # if predictions[i] == gold[i]: + # correct += 1 + # accuracy = correct / float(len(predictions)) + # self.assertGreaterEqual(accuracy, .95) + + def test_get_k_most_probable_labels(self): + m = 34 # num_seqs + n1 = 4 # num_labeling_funcs + n2 = 5 # num_linking_funcs + k = 4 # num_classes + + model = LinkedHMM(k, n1, n2) + with torch.no_grad(): + model.start_balance[0] = 0 + model.start_balance[1] = 0.5 + for i in range(n1): + model.propensity[i] = 0 + for j in range(k): + model.accuracy[i, j] = 1 + for i in range(n2): + model.link_propensity[i] = 0 + model.link_accuracy[i] = 1.5 + for i in range(k): + for j in range(k): + model.transitions[i, j] = 1 if i == j else 0 + + labels, links, seq_starts, gold = _generate_data( + m, 7, 7, n1, n2, + model.get_label_accuracies(), + model.get_link_accuracies(), + model.get_label_propensities(), + model.get_link_propensities(), + model.get_start_balance(), + model.get_transition_matrix()) + + # model.estimate_label_model(labels, links, seq_starts) + # # assert that when topk = 1, the output of get_k most_probable_labels is the same as get_most_probable_labels + # # for the fact that torch.argmax (used in get_most_probable_labels) == torch.topk (used in get_k_most_probable_labels) + # # when topk = 1. + # predictions = model.get_most_probable_labels(labels, links, seq_starts) + # k_predictions = model.get_k_most_probable_labels(labels, links, seq_starts, topk=1) + # self.assertIsNone(np.testing.assert_array_equal(k_predictions[0], predictions)) + + # # assert that when topk > 1, the viterbi_scores of the first sequence from get_k most_probable_labels + # # is the same as get_most_probable_labels + # viterbi_scores = model.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + # k_viterbi_scores = model.get_k_most_probable_labels(labels, links, seq_starts, topk=9, return_viterbi_scores=True) + # self.assertIsNone(np.testing.assert_array_equal(k_viterbi_scores[:, 0], viterbi_scores)) + + # # assert that when topk > 1, all sequences from get_k most_probable_labels are different from one another + # k_predictions = model.get_k_most_probable_labels(labels, links, seq_starts, topk=3) + # self.assertEqual(np.unique(k_predictions, axis=0).shape[0], k_predictions.shape[0]) + + # # assert that when topk > 1, the viterbi scores are in a non-increasing order. 
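+        # The commented-out assertions above document the intended invariants of
+        # get_k_most_probable_labels: with topk=1 its path (and score) should match
+        # get_most_probable_labels, the topk returned paths should be pairwise
+        # distinct, and their normalized Viterbi scores should be non-increasing in k.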
+ + ### 👀 Ontonotes + labels, links, seq_starts = torch.load("/Users/zhengxinyong/Desktop/labelmodels/downloads/link_hmm_inputs.pt") + print(labels.shape, links.shape) #### change num_labeling_funcs according given NoABS labeling function + acc_prior = 50 + link_hmm = LinkedHMM( + num_classes=4, + num_labeling_funcs=8, + num_linking_funcs=7, + init_acc=0.7, + acc_prior=acc_prior, + balance_prior=100) + # link_hmm.estimate_label_model(labels, links, seq_starts) + link_hmm_saved_fp = f"/Users/zhengxinyong/Desktop/labelmodels/ontonotes/labelmodel_link_hmm_prior_{acc_prior}.pt" + # # torch.save(link_hmm, link_hmm_saved_fp) + link_hmm = torch.load(link_hmm_saved_fp) + print(f"✅ Done loading link hmm with acc_prior={acc_prior}.") + print("get label accuracies:") + print(link_hmm.get_label_accuracies()) + print("get label propensities:") + print(link_hmm.get_label_accuracies()) + print("get link accuracies:") + print(link_hmm.get_link_accuracies()) + print("get link propensities:") + print(link_hmm.get_link_propensities()) + print("get start balance:") + print(link_hmm.get_start_balance()) + print("get transition matrix:") + print(link_hmm.get_transition_matrix()) + + print(link_hmm.get_label_distribution(labels, links, seq_starts)) + + # K = [5000] + # for k in K: + # # viterbi_paths, viterbi_scores = link_hmm.get_k_most_probable_labels(labels[12066:12071, :], links[12066:12071, :], [0], topk=k, return_viterbi_scores=True) + # viterbi_paths, viterbi_scores = link_hmm.get_k_most_probable_labels(labels, links, seq_starts, topk=k, return_viterbi_scores=True) + # # print(seq_starts) + # # print(np.sum(np.exp(viterbi_scores), 0)) + # print(f"✅ Done generating {k} (acc prior {acc_prior}).") + # print(viterbi_scores) + # torch.save(viterbi_paths, f"/Users/zhengxinyong/Desktop/labelmodels/ontonotes/{k}_viterbi_paths_prior_{acc_prior}.pt") + # torch.save(viterbi_scores, f"/Users/zhengxinyong/Desktop/labelmodels/ontonotes/{k}_viterbi_scores_prior_{acc_prior}.pt") + + # viterbi_paths = torch.load("/Users/zhengxinyong/Desktop/labelmodels/ontonotes/5_viterbi_paths.pt") + # viterbi_scores = torch.load("/Users/zhengxinyong/Desktop/labelmodels/ontonotes/5_viterbi_scores.pt") + # print(viterbi_scores) + # print(viterbi_paths.shape) + # print(viterbi_scores.shape) + + #### Ontonotes instance check: ensure that viterbi paths are correct (when topk > possible enumeration of sequences) + # instance 780 only has 4 tokens + # if k > 256: + # self.assertEqual(sum(viterbi_paths[256][seq_starts[780]:seq_starts[781]]), -4) + # self.assertEqual(viterbi_scores[256][780], -1) + + # instance 498, 561, 751, 769, 779, 784, 821, 822, 858 have 5 tokens + + # #### Ontonotes instance check: check scores + # self.assertEqual(viterbi_scores[0][31], -112.71656799316406) + # predictions, scores = link_hmm.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + # self.assertEqual(scores[31], -112.71656799316406) + + # #### NECESSARY BUT INSUFFICIENT - comparison between get_most_probable and get_k_most_probable + # self.assertIsNone(np.testing.assert_array_equal(viterbi_paths[0, :], predictions)) + # self.assertIsNone(np.testing.assert_array_equal(viterbi_scores[0, :], scores)) + + + # #### 💻 Laptop Reviews + # labels, links, seq_starts = torch.load("/Users/zhengxinyong/Desktop/labelmodels/downloads/laptop_link_hmm_inputs.pt") + # # print(labels.shape, links.shape) #### change num_labeling_funcs according given NoABS labeling function + # link_hmm_saved_fp = 
f"/Users/zhengxinyong/Desktop/labelmodels/downloads/laptop_esteban_link_hmm.pt" + # link_hmm = torch.load(link_hmm_saved_fp) + # print("get label accuracies:") + # print(link_hmm.get_label_accuracies()) + # print("get label propensities:") + # print(link_hmm.get_label_accuracies()) + # print("get link accuracies:") + # print(link_hmm.get_link_accuracies()) + # print("get link propensities:") + # print(link_hmm.get_link_propensities()) + # print("get start balance:") + # print(link_hmm.get_start_balance()) + # print("get transition matrix:") + # print(link_hmm.get_transition_matrix()) + # print(f"smallest number of tokens in a sequence: {min([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + # print(f"largest number of tokens in a sequence: {max([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + + # K = [1, 2, 3, 4, 5] + # for k in K: + # viterbi_paths, viterbi_scores = link_hmm.get_k_most_probable_labels(labels, links, seq_starts, topk=k, return_viterbi_scores=True) + # torch.save(viterbi_paths, f"/Users/zhengxinyong/Desktop/labelmodels/laptop/{k}_viterbi_paths.pt") + # torch.save(viterbi_scores, f"/Users/zhengxinyong/Desktop/labelmodels/laptop/{k}_viterbi_scores.pt") + # print(f"✅ Laptop Reviews: Done generating {k}.") + + # predictions, scores = link_hmm.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + # viterbi_paths = torch.load(f"/Users/zhengxinyong/Desktop/labelmodels/laptop/10_viterbi_paths.pt") + + # self.assertIsNone(np.testing.assert_array_equal(viterbi_paths[0, :], predictions)) + # print(-1 in viterbi_paths) + + # #### 🥼 NCBI + # labels, links, seq_starts = torch.load("/Users/zhengxinyong/Desktop/labelmodels/downloads/ncbi_link_hmm_inputs.pt") + # # print(labels.shape, links.shape) #### change num_labeling_funcs according given NoABS labeling function + # link_hmm_saved_fp = f"/Users/zhengxinyong/Desktop/labelmodels/downloads/ncbi_esteban_link_hmm.pt" + # link_hmm = torch.load(link_hmm_saved_fp) + # print("get label accuracies:") + # print(link_hmm.get_label_accuracies()) + # print("get label propensities:") + # print(link_hmm.get_label_accuracies()) + # print("get link accuracies:") + # print(link_hmm.get_link_accuracies()) + # print("get link propensities:") + # print(link_hmm.get_link_propensities()) + # print("get start balance:") + # print(link_hmm.get_start_balance()) + # print("get transition matrix:") + # print(link_hmm.get_transition_matrix()) + + # print(f"smallest number of tokens in a sequence: {min([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + # print(f"largest number of tokens in a sequence: {max([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + + # for i in range(len(seq_starts) - 1): + # print(seq_starts[i + 1] - seq_starts[i], seq_starts[i], seq_starts[i + 1]) + + # K = [1, 2, 3, 4, 5] + # for k in K: + # viterbi_paths, viterbi_scores = link_hmm.get_k_most_probable_labels(labels, links, seq_starts, topk=k, return_viterbi_scores=True) + # torch.save(viterbi_paths, f"/Users/zhengxinyong/Desktop/labelmodels/ncbi/{k}_viterbi_paths.pt") + # torch.save(viterbi_scores, f"/Users/zhengxinyong/Desktop/labelmodels/ncbi/{k}_viterbi_scores.pt") + # print(f"✅ NCBI: Done generating {k}.") + + # viterbi_paths = torch.load(f"/Users/zhengxinyong/Desktop/labelmodels/ncbi/1_viterbi_paths.pt") + # # print(viterbi_paths[0, 1242:1316]) + # # predictions, scores = link_hmm.get_most_probable_labels(labels, links, seq_starts, 
return_viterbi_scores=True) + # self.assertIsNone(np.testing.assert_array_equal(viterbi_paths[0, :], predictions)) + + + # #### 💿 CDR + # labels, links, seq_starts = torch.load("/Users/zhengxinyong/Desktop/labelmodels/downloads/cdr_link_hmm_inputs.pt") + # # print(labels.shape, links.shape) #### change num_labeling_funcs according given NoABS labeling function + # link_hmm_saved_fp = f"/Users/zhengxinyong/Desktop/labelmodels/downloads/cdr_esteban_link_hmm.pt" + # link_hmm = torch.load(link_hmm_saved_fp) + # print("get label accuracies:") + # print(link_hmm.get_label_accuracies()) + # print("get label propensities:") + # print(link_hmm.get_label_accuracies()) + # print("get link accuracies:") + # print(link_hmm.get_link_accuracies()) + # print("get link propensities:") + # print(link_hmm.get_link_propensities()) + # print("get start balance:") + # print(link_hmm.get_start_balance()) + # print("get transition matrix:") + # print(link_hmm.get_transition_matrix()) + + # print(f"smallest number of tokens in a sequence: {min([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + # print(f"largest number of tokens in a sequence: {max([seq_starts[i + 1] - seq_starts[i] for i in range(len(seq_starts) - 1)])}") + + # K = [1, 2, 3, 4, 5] + # for k in K: + # viterbi_paths, viterbi_scores = link_hmm.get_k_most_probable_labels(labels, links, seq_starts, topk=k, return_viterbi_scores=True) + # torch.save(viterbi_paths, f"/Users/zhengxinyong/Desktop/labelmodels/cdr/{k}_viterbi_paths.pt") + # torch.save(viterbi_scores, f"/Users/zhengxinyong/Desktop/labelmodels/cdr/{k}_viterbi_scores.pt") + # print(f"✅ BC5CDR: Done generating {k}.") + + # viterbi_paths = torch.load(f"/Users/zhengxinyong/Desktop/labelmodels/cdr/10_viterbi_paths.pt") + # predictions, scores = link_hmm.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + # self.assertIsNone(np.testing.assert_array_equal(viterbi_paths[0, :], predictions)) + + # print(-1 in viterbi_paths) + + + + # def test_compute_viterbi(self): + # m = 1 # num_seqs + # n1 = 4 # num_labeling_funcs + # n2 = 5 # num_linking_funcs + # k = 3 # num_classes + + # model = LinkedHMM(k, n1, n2) + # with torch.no_grad(): + # model.start_balance[0] = 0 + # model.start_balance[1] = 0.5 + # for i in range(n1): + # model.propensity[i] = 0 + # for j in range(k): + # model.accuracy[i, j] = 1 + # for i in range(n2): + # model.link_propensity[i] = 0 + # model.link_accuracy[i] = 1.5 + # for i in range(k): + # for j in range(k): + # model.transitions[i, j] = 1 if i == j else 0 + + # labels, links, seq_starts, gold = _generate_data( + # m, 20, 20, n1, n2, + # model.get_label_accuracies(), + # model.get_link_accuracies(), + # model.get_label_propensities(), + # model.get_link_propensities(), + # model.get_start_balance(), + # model.get_transition_matrix()) + + # predictions = model.get_most_probable_labels(labels, links, seq_starts) + # scores = model.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + + # path_scores_list = model.compute_viterbi(labels, links, seq_starts) + + # for i in range(len(seq_starts)): + # start_idx = seq_starts[i] + # if i == len(seq_starts) - 1: + # end_idx = len(predictions) + # else: + # end_idx = seq_starts[i + 1] + # path = ''.join(map(str, list(predictions[start_idx:end_idx]-1))) + # print(path, scores[i], path_scores_list[i][path]) + # assert scores[i] == path_scores_list[i][path] + + + # # ### TODO: test get_k_most_probable_labels + # # predictions = 
model.get_k_most_probable_labels(labels, links, seq_starts, topk=2) + # # scores = model.get_most_probable_labels(labels, links, seq_starts, return_viterbi_scores=True) + # # path_scores_list = model.compute_viterbi(labels, links, seq_starts) + def _generate_data(num_seqs, min_seq, max_seq, num_label_funcs, num_link_funcs, label_accs, link_accs, label_propensities, link_propensities, start_balance, transitions): # Generates sequence starts - seq_starts = np.zeros((num_seqs,), dtype=np.int) + seq_starts = np.zeros((num_seqs,), dtype=int) total_len = 0 for i in range(num_seqs): seq_len = np.random.randint(min_seq, max_seq + 1) @@ -233,7 +510,7 @@ def _generate_data(num_seqs, min_seq, max_seq, num_label_funcs, num_link_funcs, seq_starts[i + 1] = total_len # Generates sequences of gold labels - gold = np.zeros((total_len,), dtype=np.int) + gold = np.zeros((total_len,), dtype=int) next_start = 0 for i in range(total_len): if next_start < len(seq_starts) and i == seq_starts[next_start]: @@ -286,6 +563,7 @@ def _generate_data(num_seqs, min_seq, max_seq, num_label_funcs, num_link_funcs, return labels, links, seq_starts, gold + if __name__ == '__main__': unittest.main() diff --git a/wiser_tanl/linked_hmm.py b/wiser_tanl/linked_hmm.py new file mode 100644 index 0000000..50213e1 --- /dev/null +++ b/wiser_tanl/linked_hmm.py @@ -0,0 +1,132 @@ +import sys +mypath = "/Users/zhengxinyong/Desktop/labelmodels" +sys.path.append(mypath) + + +from labelmodels import LinkedHMM, LearningConfig +import numpy as np +from scipy import sparse +from scipy import special +import torch +from tqdm import tqdm + +def sample_k_labels(input_fp, output_fp, dataset, k=1000): + # TODO: integrate into LinkedHMM + labels, links, seq_starts = torch.load(f"{input_fp}/{dataset}_link_hmm_inputs.pt") + model_saved_fp = f"{input_fp}/{dataset}_link_hmm.pt" + model = torch.load(model_saved_fp) + print(f"✅ Done loading link hmm.") + print("get label accuracies:") + print(model.get_label_accuracies()) + + # p_unary, p_pairwise = model.get_label_distribution(labels, links, seq_starts) + # torch.save([p_unary, p_pairwise], f"{output_fp}/emp_dist/{dataset}_unary_pairwise.pt") + p_unary, p_pairwise = torch.load(f"{output_fp}/emp_dist/{dataset}_unary_pairwise.pt") + + print(f"⛏ Sampling k={k} label sequences") + paths = np.zeros((k, p_unary.shape[0]), dtype=int) + instance_idx = -1 + num_choices = p_unary.shape[1] + for i in tqdm(range(k)): + for j in range(p_unary.shape[0]): + if j in seq_starts: + instance_idx += 1 + label = np.random.choice(num_choices, size=1, p=p_unary[j]/p_unary[j].sum()) + paths[i][j] = label[0] + 1 + else: + prev_label = paths[i][j - 1] - 1 + next_label_dist = p_pairwise[j - 1][prev_label] + label = np.random.choice(num_choices, size=1, p=next_label_dist/next_label_dist.sum()) + paths[i][j] = label[0] + 1 + instance_idx = -1 + + torch.save(paths, f"{output_fp}/{dataset}_{k}_sampled_paths.pt") + return paths + +def sampling_empirical_distribution(input_fp, output_fp, dataset, k=100): + paths = torch.load(f"{output_fp}/{dataset}_{k}_sampled_paths.pt") + model_saved_fp = f"{input_fp}/{dataset}_link_hmm.pt" + model = torch.load(model_saved_fp) + empirical_dist = np.zeros((paths.shape[1], model.get_label_accuracies().shape[1])) + print("empirical_dist.shape:", empirical_dist.shape) + + for path_k in tqdm(range(paths.shape[0])): + path = paths[path_k] + + for i in range(len(path)): + if path[i] < 0: + continue + + empirical_dist[i][path[i] - 1] += 1 + empirical_dist = empirical_dist / k + 
np.save(open(f"{output_fp}/emp_dist/{dataset}_sampled_{k}.npy", "wb"), empirical_dist) + +def top_k_empirical_distribution(input_fp, output_fp, dataset, acc_prior=50, k=50): + labels, links, seq_starts = torch.load(f"{input_fp}/{dataset}_link_hmm_inputs.pt") + link_hmm_saved_fp = f"{input_fp}/labelmodel_link_hmm_prior_{acc_prior}.pt" + link_hmm = torch.load(link_hmm_saved_fp) + print(f"✅ Done loading link hmm with acc_prior={acc_prior}.") + print("get label accuracies:") + print(link_hmm.get_label_accuracies()) + # print("get label propensities:") + # print(link_hmm.get_label_accuracies()) + # print("get link accuracies:") + # print(link_hmm.get_link_accuracies()) + # print("get link propensities:") + # print(link_hmm.get_link_propensities()) + # print("get start balance:") + # print(link_hmm.get_start_balance()) + # print("get transition matrix:") + # print(link_hmm.get_transition_matrix()) + + # getting empirical distribution + print(f"Getting empirical distribution (k = {k}):") + viterbi_paths = torch.load(f"{output_fp}/{k}_viterbi_paths_prior_50.pt") + viterbi_scores = torch.load(f"{output_fp}/{k}_viterbi_scores_prior_50.pt") + viterbi_scores = np.ma.masked_where(viterbi_scores == -1, viterbi_scores) + + empirical_dist = np.zeros((viterbi_paths.shape[1], link_hmm.get_label_accuracies().shape[1])) + print("empirical_dist.shape:", empirical_dist.shape) + + for path_k in tqdm(range(viterbi_paths.shape[0])): + path = viterbi_paths[path_k] + + for i in range(len(path)): + if path[i] < 0: + continue + + if i in seq_starts: + score = viterbi_scores[path_k][np.where(seq_starts == i)[0][0]] + all_scores = viterbi_scores[:, np.where(seq_starts == i)[0][0]] + all_scores = all_scores[all_scores.mask == False] + total_score = special.logsumexp(all_scores.filled()) + + empirical_dist[i][path[i] - 1] += np.exp(score - total_score) # weighted by scores + + print(empirical_dist[17065:17069]) + np.save(open(f"{output_fp}/emp_dist/{dataset}_top_{k}.npy", "wb"), empirical_dist) + +def get_posterior_marginal(dataset): + # right now I treat unary marginal as posterior marginal + ... + + +if __name__ == '__main__': + # top_k_empirical_distribution( + # input_fp="/Users/zhengxinyong/Desktop/labelmodels/inputs/ontonotes", + # output_fp="/Users/zhengxinyong/Desktop/labelmodels/outputs/ontonotes", + # dataset="ontonotes") + + + for k in [2000, 3000, 10000]: + sample_k_labels( + input_fp="/Users/zhengxinyong/Desktop/labelmodels/inputs/ontonotes", + output_fp="/Users/zhengxinyong/Desktop/labelmodels/outputs/ontonotes", + dataset="ontonotes", + k=k) + + sampling_empirical_distribution( + input_fp="/Users/zhengxinyong/Desktop/labelmodels/inputs/ontonotes", + output_fp="/Users/zhengxinyong/Desktop/labelmodels/outputs/ontonotes", + dataset="ontonotes", + k=k)
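As a side note on `top_k_empirical_distribution` above: each of the k decoded paths contributes its renormalized probability mass to the class it assigns at every token of its sequence. Below is a minimal sketch of that weighting in isolation, with assumed toy paths and scores (not taken from the diff).

```python
import numpy as np
from scipy import special

num_classes = 3
paths = np.array([[1, 2, 1], [1, 1, 1]])   # top-2 label paths for one 3-token sequence
scores = np.array([-1.2, -2.3])            # normalized log-probabilities of those paths

# softmax over the k path scores: weight each path by its share of the retained mass
weights = np.exp(scores - special.logsumexp(scores))

emp = np.zeros((paths.shape[1], num_classes))
for k, path in enumerate(paths):
    for i, label in enumerate(path):
        emp[i, label - 1] += weights[k]     # labels are 1-indexed, classes 0-indexed

print(emp)  # each row sums to 1 because the k weights are renormalized to a distribution
```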