Reading the train and test data
Examples below assume that you have read the training and test data into variables named as shown below.
>>> train_fp = open('/data/senseval3/train/EnglishLS.train', 'r')
>>> train_data = get_data(train_fp)
>>> trainkey_fp = open('/data/senseval3/train/EnglishLS.train.key', 'r')
>>> trainkey = get_key(trainkey_fp)
>>> test_fp = open('/data/senseval3/test/EnglishLS.test', 'r')
>>> test_data = get_data(test_fp)
>>> testkey_fp = open('/data/senseval3/test/EnglishLS.test.key', 'r')
>>> testkey = get_key(testkey_fp)
get_BoW_features
output
>>> instance = train_data['activate.v']['activate.v.bnc.00044852']
>>> features = get_BoW_features(instance)
Counter({'the': 20, 'of': 14, ',': 10, 'to': 7, 'in': 6, 'are': 6, 'and': 5, 'experience': 4, 'system': 4, '.': 4, 'receptors': 4, 'or': 3, 'energy': 3, 'specific': 3, 'that': 3, 'For': 2, 'properties': 2, 'experienced': 2, '(': 2, ')': 2, 'sensory': 2, 'quality': 2, 'upon': 2, 'nervous': 2, 'which': 2, 'stimulation': 2, 'retinal': 2, 'light': 2, ';': 2, 'ear': 2, 'so': 2, 'on': 2, 'different': 2, 'organs': 2, 'endings': 2, 'neurophysiologists': 1, 'neuropsychologists': 1, 'way': 1, 'forward': 1, 'understanding': 1, 'perception': 1, 'has': 1, 'been': 1, 'correlate': 1, 'these': 1, 'dimensions': 1, 'with': 1, 'firstly': 1, 'material': 1, 'object': 1, 'event': 1, 'usually': 1, 'regarded': 1, 'as': 1, 'stimulus': 1, 'secondly': 1, 'patterns': 1, 'discharges': 1, 'Qualitative': 1, 'Aspects': 1, 'Experience': 1, 'The': 1, 'modality': 1, 'depends': 1, 'less': 1, 'reaching': 1, 'than': 1, 'parts': 1, 'activated': 1, ':': 1, 'causes': 1, 'an': 1, 'inner': 1, 'gives': 1, 'rise': 1, 'sound': 1, 'Muller': 1, "'s": 1, 'nineteenth': 1, '-': 1, 'century': 1, 'doctrine': 1, 'energies': 1, 'formalized': 1, 'ordinary': 1, 'observation': 1, 'sense': 1, 'sensitive': 1, 'physical': 1, 'world': 1, 'when': 1, 'they': 1, 'stimulated': 1, 'sensations': 1, 'those': 1, 'It': 1, 'was': 1, 'proposed': 1, 'there': 1, 'within': 1, 'attuned': 1, 'types': 1, 'example': 1, 'eye': 1, 'respond': 1, 'cochlear': 1, 'vibrations': 1, 'air': 1})
This output, and others like it, assumes that you have used a Counter
to store your features. If you used a dictionary or defaultdict,
you can transform your data into a Counter
as follows, then compare to the result above.
>>> from collections import Counter
>>> counter = Counter(feature_dict)
get_colloc_features
output
>>> instance = train_data['activate.v']['activate.v.bnc.00044852']
>>> features = get_colloc_features(instance)
>>> features
Counter({'system_are_activated': 1, 'are_activated_:': 1, 'activated_:_stimulation': 1, 'are_activated': 1, 'activated_:': 1})
get_features
output
>>> features = get_features(train_data, 'activate.v')
>>> features.keys()
dict_keys(['activate.v.bnc.00024693', 'activate.v.bnc.00044852', 'activate.v.bnc.00044866', 'activate.v.bnc.00044869', 'activate.v.bnc.00050506', 'activate.v.bnc.00050507', 'activate.v.bnc.00056554', 'activate.v.bnc.00056686', 'activate.v.bnc.00067644', 'activate.v.bnc.00067694', 'activate.v.bnc.00072657', 'activate.v.bnc.00089813', 'activate.v.bnc.00099439', 'activate.v.bnc.00195891', 'activate.v.bnc.00197716', 'activate.v.bnc.00210727', 'activate.v.bnc.00248089', 'activate.v.bnc.00251499', 'activate.v.bnc.00270989', 'activate.v.bnc.00307829', 'activate.v.bnc.00312935', 'activate.v.bnc.00357129', 'activate.v.bnc.00421223', 'activate.v.bnc.00421381', 'activate.v.bnc.00430058', 'activate.v.bnc.00442716', 'activate.v.bnc.00445800', 'activate.v.bnc.00455954', 'activate.v.bnc.00466012', 'activate.v.bnc.00487700', 'activate.v.bnc.00504198', 'activate.v.bnc.00504847', 'activate.v.bnc.00508510', 'activate.v.bnc.00509400', 'activate.v.bnc.00509428', 'activate.v.bnc.00511084', 'activate.v.bnc.00511093', 'activate.v.bnc.00522938', 'activate.v.bnc.00536782', 'activate.v.bnc.00538373', 'activate.v.bnc.00554444', 'activate.v.bnc.00573075', 'activate.v.bnc.00600931', 'activate.v.bnc.00611631', 'activate.v.bnc.00617820', 'activate.v.bnc.00617926', 'activate.v.bnc.00618027', 'activate.v.bnc.00618308', 'activate.v.bnc.00618351', 'activate.v.bnc.00618998', 'activate.v.bnc.00619004', 'activate.v.bnc.00619023', 'activate.v.bnc.00619114', 'activate.v.bnc.00619117', 'activate.v.bnc.00619121', 'activate.v.bnc.00622952', 'activate.v.bnc.00630111', 'activate.v.bnc.00635669', 'activate.v.bnc.00651647', 'activate.v.bnc.00654570', 'activate.v.bnc.00660201', 'activate.v.bnc.00683751', 'activate.v.bnc.00695679', 'activate.v.bnc.00709425', 'activate.v.bnc.00709482', 'activate.v.bnc.00709491', 'activate.v.bnc.00709504', 'activate.v.bnc.00709538', 'activate.v.bnc.00709542', 'activate.v.bnc.00795202', 'activate.v.bnc.00795257', 'activate.v.bnc.00795259', 'activate.v.bnc.00795637', 
'activate.v.bnc.00833644', 'activate.v.bnc.00851793', 'activate.v.bnc.00851871', 'activate.v.bnc.00851882', 'activate.v.bnc.00897037', 'activate.v.bnc.00928923', 'activate.v.bnc.00928926', 'activate.v.bnc.00928939', 'activate.v.bnc.00931302', 'activate.v.bnc.00935121', 'activate.v.bnc.00935123', 'activate.v.bnc.00935127', 'activate.v.bnc.00935319', 'activate.v.bnc.00938095', 'activate.v.bnc.00939944', 'activate.v.bnc.00941772', 'activate.v.bnc.00945604', 'activate.v.bnc.00945615', 'activate.v.bnc.00945684', 'activate.v.bnc.00953632', 'activate.v.bnc.00953642', 'activate.v.bnc.00958265', 'activate.v.bnc.00962430', 'activate.v.bnc.00969714', 'activate.v.bnc.00972290', 'activate.v.bnc.00972291', 'activate.v.bnc.00975929', 'activate.v.bnc.00997343', 'activate.v.bnc.01000398', 'activate.v.bnc.01000863', 'activate.v.bnc.01003340', 'activate.v.bnc.01009610', 'activate.v.bnc.01039017', 'activate.v.bnc.01052885', 'activate.v.bnc.01053057', 'activate.v.bnc.01064435', 'activate.v.bnc.01100525', 'activate.v.bnc.01102725', 'activate.v.bnc.01123744', 'activate.v.bnc.01128832', 'activate.v.bnc.01133779', 'activate.v.bnc.01169757', 'activate.v.bnc.01170345', 'activate.v.bnc.01171650', 'activate.v.bnc.01172145', 'activate.v.bnc.01172270', 'activate.v.bnc.01178361', 'activate.v.bnc.01199683', 'activate.v.bnc.01199697', 'activate.v.bnc.01199701', 'activate.v.bnc.01201521', 'activate.v.bnc.01204487', 'activate.v.bnc.01206797', 'activate.v.bnc.01210816', 'activate.v.bnc.01276214', 'activate.v.bnc.01276219', 'activate.v.bnc.01279048', 'activate.v.bnc.01292892', 'activate.v.bnc.01308739', 'activate.v.bnc.01318494', 'activate.v.bnc.01366679', 'activate.v.bnc.01371077', 'activate.v.bnc.01371597', 'activate.v.bnc.01371666', 'activate.v.bnc.01372084', 'activate.v.bnc.01372090', 'activate.v.bnc.01374857', 'activate.v.bnc.01415729', 'activate.v.bnc.01450597', 'activate.v.bnc.01450608', 'activate.v.bnc.01464815', 'activate.v.bnc.01479895', 'activate.v.bnc.01513543', 'activate.v.bnc.01597373', 
'activate.v.bnc.01604532', 'activate.v.bnc.01605268', 'activate.v.bnc.01605269', 'activate.v.bnc.01605415', 'activate.v.bnc.01650342', 'activate.v.bnc.01716937', 'activate.v.bnc.01731519', 'activate.v.bnc.01731520', 'activate.v.bnc.01762238', 'activate.v.bnc.01767619', 'activate.v.bnc.01768110', 'activate.v.bnc.01772038', 'activate.v.bnc.01775004', 'activate.v.bnc.01775428', 'activate.v.bnc.01808341', 'activate.v.bnc.01819409', 'activate.v.bnc.01819427', 'activate.v.bnc.01819946', 'activate.v.bnc.01820100', 'activate.v.bnc.01820168', 'activate.v.bnc.01820220', 'activate.v.bnc.01820329', 'activate.v.bnc.01833487', 'activate.v.bnc.01840231', 'activate.v.bnc.01850569', 'activate.v.bnc.01870620', 'activate.v.bnc.01874636', 'activate.v.bnc.01876754', 'activate.v.bnc.01878690', 'activate.v.bnc.01884671', 'activate.v.bnc.01884953', 'activate.v.bnc.01898698', 'activate.v.bnc.01898896', 'activate.v.bnc.01899500', 'activate.v.bnc.01899834', 'activate.v.bnc.01899966', 'activate.v.bnc.01900282', 'activate.v.bnc.01918875', 'activate.v.bnc.01919328', 'activate.v.bnc.01919637', 'activate.v.bnc.01919639', 'activate.v.bnc.01919761', 'activate.v.bnc.01919762', 'activate.v.bnc.01919846', 'activate.v.bnc.01919884', 'activate.v.bnc.01919885', 'activate.v.bnc.01920012', 'activate.v.bnc.01927086', 'activate.v.bnc.01967270', 'activate.v.bnc.01967564', 'activate.v.bnc.01971058', 'activate.v.bnc.01973574', 'activate.v.bnc.01992378', 'activate.v.bnc.01992435', 'activate.v.bnc.02009265', 'activate.v.bnc.02009557', 'activate.v.bnc.02009576', 'activate.v.bnc.02009592', 'activate.v.bnc.02009633', 'activate.v.bnc.02009835', 'activate.v.bnc.02010353', 'activate.v.bnc.02010632', 'activate.v.bnc.02011552', 'activate.v.bnc.02012206', 'activate.v.bnc.02012483', 'activate.v.bnc.02012898', 'activate.v.bnc.02013464', 'activate.v.bnc.02013514', 'activate.v.bnc.02013578', 'activate.v.bnc.02013902', 'activate.v.bnc.02014009', 'activate.v.bnc.02014015', 'activate.v.bnc.02014485', 'activate.v.bnc.02015246', 
'activate.v.bnc.02015290', 'activate.v.bnc.02015368', 'activate.v.bnc.02015487', 'activate.v.bnc.02015488', 'activate.v.bnc.02015531', 'activate.v.bnc.02016404', 'activate.v.bnc.02016434'])
>>> features['activate.v.bnc.00044852']
Counter({'the': 20, 'of': 14, ',': 10, 'to': 7, 'in': 6, 'are': 6, 'and': 5, 'experience': 4, 'system': 4, '.': 4, 'receptors': 4, 'or': 3, 'energy': 3, 'specific': 3, 'that': 3, 'For': 2, 'properties': 2, 'experienced': 2, '(': 2, ')': 2, 'sensory': 2, 'quality': 2, 'upon': 2, 'nervous': 2, 'which': 2, 'stimulation': 2, 'retinal': 2, 'light': 2, ';': 2, 'ear': 2, 'so': 2, 'on': 2, 'different': 2, 'organs': 2, 'endings': 2, 'neurophysiologists': 1, 'neuropsychologists': 1, 'way': 1, 'forward': 1, 'understanding': 1, 'perception': 1, 'has': 1, 'been': 1, 'correlate': 1, 'these': 1, 'dimensions': 1, 'with': 1, 'firstly': 1, 'material': 1, 'object': 1, 'event': 1, 'usually': 1, 'regarded': 1, 'as': 1, 'stimulus': 1, 'secondly': 1, 'patterns': 1, 'discharges': 1, 'Qualitative': 1, 'Aspects': 1, 'Experience': 1, 'The': 1, 'modality': 1, 'depends': 1, 'less': 1, 'reaching': 1, 'than': 1, 'parts': 1, 'activated': 1, ':': 1, 'causes': 1, 'an': 1, 'inner': 1, 'gives': 1, 'rise': 1, 'sound': 1, 'Muller': 1, "'s": 1, 'nineteenth': 1, '-': 1, 'century': 1, 'doctrine': 1, 'energies': 1, 'formalized': 1, 'ordinary': 1, 'observation': 1, 'sense': 1, 'sensitive': 1, 'physical': 1, 'world': 1, 'when': 1, 'they': 1, 'stimulated': 1, 'sensations': 1, 'those': 1, 'It': 1, 'was': 1, 'proposed': 1, 'there': 1, 'within': 1, 'attuned': 1, 'types': 1, 'example': 1, 'eye': 1, 'respond': 1, 'cochlear': 1, 'vibrations': 1, 'air': 1, 'system_are_activated': 1, 'are_activated_:': 1, 'activated_:_stimulation': 1, 'are_activated': 1, 'activated_:': 1})
index_features
output
The output of this function is too long to include in its entirety. The start and end of the output dictionary is shown.
>>> features = get_features(train_data, 'activate.v')
>>> findex = index_features(features)
>>> findex
{'!': 0, '%': 1, "'": 2, "'d": 3, "'ll": 4, "'m": 5, "'re": 6, "'s": 7, "'ve": 8, '(': 9, '(_directly_activated': 10, '(_injunctions_activated': 11, '(_tonically_activating': 12, ')': 13, ')_is_activated': 14, ')_was_activated': 15, '*': 16, '+': 17, ',': 18, ',_activated': 19, ',_activated_by': 20, ',_activates': 21, ',_activates_the': 22, ',_activating': 23, ',_activating_contraction': 24, ',_activating_or': 25, ',_activating_the': 26, ',_activating_your': 27, ',_being_activated': 28, ',_he_activated': 29, ',_is_activated': 30, ',_not_activated': 31, ',_so_activating': 32, ',_they_activate': 33, ...
<MANY ITEMS SKIPPED>
..., 'words': 7019, 'work': 7020, 'worked': 7021, 'worker': 7022, 'workers': 7023, 'working': 7024, 'works': 7025, 'workstation': 7026, 'world': 7027, 'worms': 7028, 'worth': 7029, 'worthless': 7030, 'worthlessness': 7031, 'would': 7032, 'would_activate': 7033, 'would_activate_them': 7034, 'would_be_activated': 7035, 'wound': 7036, 'wounding': 7037, 'writer': 7038, 'writers': 7039, 'writes': 7040, 'writhed': 7041, 'writing': 7042, 'written': 7043, 'wrong': 7044, 'wrongful': 7045, 'wrongfully': 7046, 'wrongly': 7047, 'wrote': 7048, 'x': 7049, 'yacht': 7050, 'yard': 7051, 'year': 7052, 'years': 7053, 'yeast': 7054, 'yelled': 7055, 'yes': 7056, 'yesterday': 7057, 'yet': 7058, 'yet_been_activated': 7059, 'yoke': 7060, 'you': 7061, 'you_activated': 7062, 'you_activated_the': 7063, 'you_to_activate': 7064, 'young': 7065, 'youngsters': 7066, 'your': 7067, 'yours': 7068, 'yourself': 7069, 'zeal': 7070, 'zero': 7071, 'zipper': 7072, 'zoom': 7073}
create_vectors
output
The output of this function is WAY too big to include in its entirety. The output assumes the features are indexed alphabetically in the feature index and the instance ids are ordered alphabetically in the array $X$. Here are some tests you can run to be sure you are getting the correct results.
>>> features = get_features(train_data, 'activate.v')
>>> findex = index_features(features)
>>> X = create_vectors(features, findex)
>>> X[0]
array([1., 0., 0., ..., 0., 0., 0.])
>>> np.where(X[0]!=0)
(array([ 0, 18, 39, 55, 227, 311, 501, 683, 697, 732, 1205,
1235, 1271, 1296, 1314, 1332, 1390, 1424, 1426, 1835, 1867, 2015,
2065, 2066, 2083, 2140, 2149, 2271, 2297, 2711, 2722, 2838, 3004,
3143, 3163, 3669, 3679, 3699, 3705, 3773, 3786, 3804, 3836, 3936,
3980, 4198, 4219, 4282, 4361, 4370, 4464, 4693, 4709, 4739, 4760,
4794, 4871, 4901, 4904, 4912, 4973, 5118, 5210, 5459, 5475, 5544,
5661, 5727, 5899, 5985, 6079, 6112, 6153, 6193, 6344, 6389, 6437,
6455, 6469, 6558, 6559, 6568, 6671, 6691, 6936, 6945, 6949, 7001,
7010, 7061, 7067]),)
>>> reverse_index = dict([(y,x) for (x,y) in findex.items()])
>>> [reverse_index[x] for x in np.where(X[0]!=0)[0]]
['!', ',', '-', '.', '?', 'Autospade', 'Do', 'I', 'If', 'It', 'Terrex', 'Tools', 'Used', 'We', 'Wolf', 'a', 'activate', 'activate_it', 'activate_it_.', 'although', 'and', 'at', 'back', 'backache', 'barrow', 'bend', 'bicycle', 'by', 'can', 'cope', 'correctly', 'day', 'digging', 'during', 'easily', 'gardening', 'general', 'get', 'give', 'had', 'handlebars', 'have', 'hefty', 'if', 'in', 'is', 'it', 'know', 'lever', 'lift', 'made', 'more', 'move', "n't", 'need', 'never', 'of', 'on', 'on_to_activate', 'one', 'out', 'periods', 'plenty', 'put', 'quite', 'rear', 'remember', 'rest', 'seen', 'should', 'soil', 'spade', 'sprung', 'step', 'suspect', 'take', 'tends', 'than', 'the', 'to', 'to_activate', 'to_activate_it', 'try', 'type', 'what', 'where', 'which', 'with', 'wo', 'you', 'your']
You may not need to use these sums directly but you can be sure your program is working as expected by checking to see if your sums match these:
>>> X.sum()
30733.0
>>> X.sum(axis=1)
array([130., 214., 119., 126., 167., 177., 186., 174., 128., 100., 163.,
120., 134., 91., 139., 121., 122., 102., 114., 68., 97., 140.,
125., 154., 149., 96., 125., 125., 132., 200., 89., 134., 123.,
89., 69., 124., 166., 142., 119., 134., 141., 111., 123., 145.,
177., 140., 202., 113., 123., 153., 227., 196., 180., 212., 150.,
70., 143., 151., 91., 67., 124., 116., 158., 156., 142., 135.,
110., 154., 130., 208., 189., 152., 175., 107., 102., 84., 86.,
135., 122., 117., 149., 152., 161., 239., 213., 116., 181., 124.,
79., 110., 94., 92., 157., 98., 168., 102., 115., 134., 114.,
248., 98., 99., 189., 106., 109., 154., 87., 105., 191., 121.,
130., 53., 109., 77., 125., 196., 134., 80., 97., 125., 206.,
185., 125., 98., 128., 100., 169., 74., 56., 137., 264., 172.,
39., 180., 173., 112., 116., 143., 215., 176., 136., 107., 115.,
153., 128., 70., 111., 112., 108., 88., 147., 142., 129., 148.,
149., 116., 170., 134., 113., 112., 134., 78., 38., 129., 51.,
130., 88., 52., 79., 208., 115., 200., 211., 236., 165., 132.,
177., 91., 34., 66., 78., 77., 102., 108., 160., 160., 108.,
107., 162., 151., 139., 106., 117., 129., 124., 140., 161., 150.,
126., 97., 85., 130., 110., 180., 195., 105., 173., 164., 215.,
192., 156., 106., 143., 116., 141., 176., 146., 173., 151., 130.,
183., 226., 188., 112., 127., 246., 114., 203.])
index_targets
output
>>> targets = trainkey['activate.v']
>>> tindex = index_targets(targets)
>>> tindex
{'38201': 0, '38202': 1, '38203': 2, '38204': 3, 'U': 4}
create_targets
output
>>> targets = trainkey['activate.v']
>>> tindex = index_targets(targets)
>>> y = create_targets(targets, tindex)
>>> y
array([0, 0, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
0, 2, 0, 4, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
1, 2, 2, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 2, 0, 0,
0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2,
0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
0, 1, 1, 0, 0, 0, 0, 1])
Using these functions with the test data
Note: Do not use the index functions with the test data. You need to use the indexes created with the training data. The sample output reloads the training indexes as a reminder, but you should not be creating these again.
>>> features_train = get_features(train_data, 'activate.v')
>>> findex = index_features(features_train)
>>> targets_train = trainkey['activate.v']
>>> tindex = index_targets(targets_train)
>>> features_test = get_features(test_data, 'activate.v')
>>> len(features_test.keys())
114
>>> list(features_test.keys())[1]
'activate.v.bnc.00061340'
>>> features_test['activate.v.bnc.00061340']
Counter({',': 4, '.': 4, 'the': 4, 'one': 3, 'to': 2, 'in': 2, 'So': 1, 'provide': 1, 'ample': 1, 'warning': 1, 'fit': 1, 'smoke': 1, 'alarms': 1, 'Ideally': 1, 'site': 1, 'hall': 1, 'and': 1, 'another': 1, 'on': 1, 'landing': 1, 'Avoid': 1, 'fitting': 1, 'kitchen': 1, 'as': 1, 'fumes': 1, 'from': 1, 'cooking': 1, 'are': 1, 'often': 1, 'enough': 1, 'activate': 1, 'alarm': 1, 'The': 1, 'illustrated': 1, 'is': 1, 'by': 1, 'First': 1, 'Alert': 1, ':': 1, 'like': 1, 'most': 1, 'types': 1, 'it': 1, 'can': 1, 'be': 1, 'simply': 1, 'screwed': 1, 'into': 1, 'a': 1, 'ceiling': 1, 'SEALING': 1, 'GLAZING': 1, 'BARS': 1, 'enough_to_activate': 1, 'to_activate_the': 1, 'activate_the_alarm': 1, 'to_activate': 1, 'activate_the': 1})
>>> X_test = create_vectors(features_test, findex)
>>> np.where(X_test[0]!=0)
(array([ 18, 39, 55, 365, 551, 1214, 1218, 1332, 1353, 1656, 1839,
1854, 1867, 1938, 1955, 1975, 2094, 2131, 2148, 2210, 2271, 2453,
2477, 2536, 2666, 3385, 3413, 3415, 3577, 3590, 3668, 3725, 3801,
3804, 3926, 3968, 3980, 3981, 4340, 4369, 4409, 4530, 4693, 4871,
5049, 5053, 5444, 5654, 5791, 5899, 6066, 6090, 6296, 6301, 6469,
6517, 6522, 6558, 6667, 6827, 6888, 6929, 6936, 7001, 7005, 7027]),)
>>> reverse_index = dict([(y,x) for (x,y) in findex.items()])
>>> [reverse_index[x] for x in np.where(X_test[0]!=0)[0]]
[',', '-', '.', 'But', 'Even', 'There', 'They', 'a', 'accentuate', 'activating', 'always', 'an', 'and', 'are', 'areas', 'as', 'be', 'being', 'between', 'both', 'by', 'class', 'closed', 'commitment', 'continue', 'expected', 'extended', 'extension', 'for', 'form', 'gap', 'goals', 'has', 'have', 'ideal', 'important', 'in', 'in_activating', 'left', 'life', 'living', 'matters', 'more', 'of', 'part', 'particularly', 'pure', 'remain', 'role', 'seen', 'so', 'some', 'succeed', 'such', 'the', 'this', 'though', 'to', 'true', 'view', 'was', 'were', 'what', 'with', 'within', 'world']
>>> X_test.sum(axis=1)
array([116., 57., 84., 106., 101., 89., 67., 58., 84., 103., 140.,
101., 123., 87., 38., 125., 115., 111., 111., 82., 42., 182.,
137., 109., 163., 155., 170., 173., 200., 187., 84., 91., 103.,
164., 92., 94., 108., 136., 44., 113., 91., 129., 94., 213.,
105., 84., 102., 78., 92., 117., 67., 69., 149., 125., 51.,
126., 89., 136., 169., 81., 168., 103., 223., 74., 94., 94.,
124., 59., 59., 43., 79., 161., 154., 72., 94., 47., 43.,
138., 87., 134., 68., 204., 70., 182., 131., 144., 87., 112.,
90., 97., 116., 83., 205., 105., 108., 77., 121., 136., 160.,
167., 130., 187., 133., 194., 128., 122., 158., 173., 144., 193.,
155., 128., 99., 93.])
>>> X_test.sum()
13162.0
>>> targets_test = testkey['activate.v']
>>> y_test = create_targets(targets_test, tindex)
>>> y_test
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
0, 1, 0, 0])
fit
output
It is challenging to share complete information about the decision list because the output is very large. In the output below, the decision list was trained on the lexelt 'activate.v'.
The decision list needs to be created with a value of alpha=0.1
and the most frequent sense. Assuming you’ve indexed the same way as the sample input, the index of the most frequent sense should be 0. The output below assumes that you have written a function called get_mfs
that returns the most frequent sense. You do not have to write that function but you may find it helpful.
NOTE: The scores below assume using log
base 2. To take the log
base 2 of a number in Python, you can do this:
>>> import math
>>> value = 128
>>> math.log(value, 2)
7.0
Internal to the class, you should have some way of storing the rules. There are no requirements for how you store these rules. Each rule should have 3 components, though: the feature, the sense, and the score. In the output below, this is shown as a list of ((feature_index, sense_index), score)
tuples.
>>> lexelt = 'activate.v'
>>> features_train = get_features(train_data, lexelt)
>>> findex = index_features(features_train)
>>> X_train = create_vectors(features_train, findex)
>>> key_train = trainkey[lexelt]
>>> tindex = index_targets(key_train)
>>> y_train = create_targets(key_train, tindex)
>>> mfs = get_mfs(y_train)
>>> mfs
0
>>> alpha=0.1
>>> clf = DecisionList(alpha=alpha, default_target=mfs)
>>> clf.fit(X_train, y_train)
>>> len(clf.rules)
6763
>>> clf.rules[:20]
[((7067, 0), 8.413627929024173), ((4077, 0), 7.851749041416057), ((4047, 0), 7.787902559391432), ((1218, 0), 7.721099188707186), ((878, 1), 7.651051691178929), ((4482, 0), 7.651051691178929), ((3326, 0), 7.577428828035749), ((5218, 0), 7.577428828035749), ((1800, 0), 7.499845887083206), ((4980, 0), 7.499845887083206), ((3338, 0), 7.330916878114618), ((4296, 0), 7.330916878114618), ((5687, 0), 7.330916878114618), ((4330, 0), 7.238404739325079), ((6208, 0), 7.238404739325079), ((2365, 0), 7.03342300153745), ((4655, 0), 7.03342300153745), ((6106, 0), 7.03342300153745), ((4709, 0), 6.9188632372745955), ((5899, 0), 6.9188632372745955)]
>>> reverse_index = dict([(y,x) for (x,y) in findex.items()])
>>> [((reverse_index[a], b), c) for ((a,b),c) in clf.rules[:20]]
[(('your', 0), 8.413627929024173), (('input', 0), 7.851749041416057), (('information', 0), 7.787902559391432), (('They', 0), 7.721099188707186), (('Myc', 1), 7.651051691178929), (('make', 0), 7.651051691178929), (('et', 0), 7.577428828035749), (('point', 0), 7.577428828035749), (('al', 0), 7.499845887083206), (('output', 0), 7.499845887083206), (('every', 0), 7.330916878114618), (('language', 0), 7.330916878114618), (('representation', 0), 7.330916878114618), (('learning', 0), 7.238404739325079), (('stimuli', 0), 7.238404739325079), (('central', 0), 7.03342300153745), (('mode', 0), 7.03342300153745), (('sound', 0), 7.03342300153745), (('move', 0), 6.9188632372745955), (('seen', 0), 6.9188632372745955)]
>>> [((reverse_index[a], b), c) for ((a,b),c) in clf.rules[-20:]]
[(('be_activated', 0), 0.47732177753318644), (('We', 0), 0.40911266532173834), (('acid', 0), 0.40911266532173834), (('Ca', 0), 0.40335569423120843), (('LC', 2), 0.40335569423120843), (('chromosomes', 1), 0.40335569423120843), (('current', 0), 0.40335569423120843), (('energy', 0), 0.40335569423120843), (('rate', 0), 0.40335569423120843), (('trap', 0), 0.40335569423120843), (('c', 0), 0.33902408871066986), (('idea', 0), 0.3148733373534119), (('screen', 2), 0.3148733373534119), (('2', 0), 0.29424648196244013), (('light', 0), 0.28662122615140984), (('cells', 0), 0.26243465731512167), (('growth', 1), 0.2583119955913907), (('thus', 0), 0.2583119955913907), (('shown', 0), 0.21900978194179574), (('such', 0), 0.1913655257158237)]
predict
output
The sample output below assumes that you have already run all of the code in the previous section for the fit
function.
>>> features_test = get_features(test_data, lexelt)
>>> X_test = create_vectors(features_test, findex)
>>> key_test = [sense for (instance_ID, sense) in sorted(testkey[lexelt].items())]
>>> y_pred = clf.predict(X_test)
>>> y_pred
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
You will need to figure out how to score these on your own. Here are the indexes that are mislabeled:
[17, 40, 41, 45, 50, 53, 54, 65, 66, 98, 99, 102, 103, 104, 106, 108, 109, 111]
For this lexelt, you should have 96 correct and 18 incorrect, for an accuracy of $\frac{96}{96+18} = 84.2\%$.