# An Inefficient Vector Space Model

In [None]:
from collections import defaultdict
from math import log, sqrt
import re

The dataset is the TIME dataset, available at http://ir.dcs.gla.ac.uk/resources/test_collections/time/

In [None]:
def import_dataset(path='TIME.ALL'):
    """
    Read the TIME corpus and return its articles as lists of terms.

    An article starts at a line beginning with "*TEXT" and runs up to the
    next such line, or to the end of the file. On every other line each
    character that is neither an ASCII letter nor whitespace is removed,
    and what is left is split on whitespace into terms.

    Parameters:
        path: location of the corpus file. Defaults to 'TIME.ALL',
            resolved against the current working directory.

    Returns:
        A list with one entry per non-empty article, in file order. Each
        entry is the list of that article's terms (strings).

    Raises:
        OSError: if the file cannot be opened.
    """
    # Compiled once instead of being looked up again for every row.
    non_letters = re.compile(r'[^a-zA-Z\s]+')
    articles = []
    current = []
    with open(path, 'r') as f:
        for row in f:
            if row.startswith("*TEXT"):
                # A new header closes the article collected so far.
                if current:
                    articles.append(current)
                current = []
            else:
                # NOTE(review): every line that is not a "*TEXT" header is
                # treated as text. If the corpus ends with a trailer line
                # (e.g. an end-of-file marker), its letters become terms
                # of the last article -- confirm against the file.
                current += non_letters.sub('', row).split()
    # The last article is not followed by a "*TEXT" line, so it has to be
    # flushed here; without this it would be silently dropped.
    if current:
        articles.append(current)
    return articles

In [None]:
def make_inverted_index(articles):
    """
    Build an inverted index for a collection of tokenized articles.

    The index is a hash table (a defaultdict) mapping each term to the
    set of docIDs of the articles containing it. A docID is the position
    of the article inside `articles`. Note that the postings are sets,
    not ordered lists.
    """
    postings = defaultdict(set)
    for docid, terms in enumerate(articles):
        # dict.fromkeys drops repeated terms while keeping the order of
        # first occurrence, so terms enter the index in the same order.
        for term in dict.fromkeys(terms):
            postings[term].add(docid)
    return postings

In [None]:
def make_positional_index(articles):
    """
    Build a positional index for a collection of tokenized articles.

    This extends the plain inverted index: a posting is not only a
    docID, but also the list of positions at which the term occurs in
    that article. The result is a defaultdict of the form
    term -> {docID -> [positions, in increasing order]}.
    """
    positional = defaultdict(dict)
    for docid, terms in enumerate(articles):
        for position, term in enumerate(terms):
            # First occurrence in this article creates the position list.
            positional[term].setdefault(docid, []).append(position)
    return positional

In [None]:
def documents_as_vectors(articles):
    """
    Represent every article as a dense tf-idf vector.

    Returns a list with one dictionary per article (in docID order).
    Each dictionary has a key for every term of the whole collection and
    maps it to the tf-idf weight of that term in the article: the number
    of occurrences of the term in the article multiplied by
    log(#documents / #documents containing the term). Terms that do not
    occur in the article get weight 0.

    The resulting structure has size O(#documents * #terms), so this is
    usable only for small collections.
    """
    postings_by_term = make_positional_index(articles)
    n_docs = len(articles)
    # Inverse document frequency, computed once per term.
    idf = {
        term: log(n_docs / len(postings))
        for term, postings in postings_by_term.items()
    }
    vectors = []
    for docid in range(n_docs):
        # One dimension (key) for each term of the collection.
        vector = {}
        for term, postings in postings_by_term.items():
            positions = postings.get(docid)
            if positions is None:
                vector[term] = 0
            else:
                vector[term] = len(positions) * idf[term]
        vectors.append(vector)
    return vectors

In [None]:
def show_document_vector(v, docid):
    """
    Print the non-zero components of the document vector v[docid].

    One line is printed per term with a positive weight, from the
    highest weight to the lowest, showing the term, its weight and the
    weight divided by the Euclidean length of the vector.
    """
    ranked = sorted(
        ((term, tfidf) for term, tfidf in v[docid].items() if tfidf > 0),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Zero weights add nothing to the length, so they can be left out.
    norm = sqrt(sum(tfidf ** 2 for _, tfidf in ranked))
    normalized = dict((term, tfidf / norm) for term, tfidf in ranked)
    for term, tfidf in ranked:
        print(f"{term}:\t{tfidf}\t(normalized: {normalized[term]})")

In [None]:
# Example of usage
# Load the corpus and build one dense tf-idf vector per article.
articles = import_dataset()
vectors = documents_as_vectors(articles)

In [None]:
# Terms of the article with docID 2, joined back into a single string.
# In a notebook the value of this expression is shown as the cell output.
" ".join(articles[2])

In [None]:
# Print the non-zero tf-idf weights of the same article (docID 2).
show_document_vector(vectors, 2)