{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# An Inefficient Vector Space Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "from math import log, sqrt\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset is the TIME dataset, available at http://ir.dcs.gla.ac.uk/resources/test_collections/time/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def import_dataset():\n",
    "    \"\"\"\n",
    "    This function import all the articles in the TIME corpus,\n",
    "    returning list of lists where each sub-list contains all the\n",
    "    terms present in the document as a string.\n",
    "    \"\"\"\n",
    "    articles = []\n",
    "    with open('TIME.ALL', 'r') as f:\n",
    "        tmp = []\n",
    "        for row in f:\n",
    "            if row.startswith(\"*TEXT\"):\n",
    "                if tmp != []:\n",
    "                    articles.append(tmp)\n",
    "                tmp = []\n",
    "            else:\n",
    "                row = re.sub(r'[^a-zA-Z\\s]+', '', row)\n",
    "                tmp += row.split()\n",
    "    return articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_inverted_index(articles):\n",
    "    \"\"\"\n",
    "    This function builds an inverted index as an hash table (dictionary)\n",
    "    where the keys are the terms and the values are ordered lists of\n",
    "    docIDs containing the term.\n",
    "    \"\"\"\n",
    "    index = defaultdict(set)\n",
    "    for docid, article in enumerate(articles):\n",
    "        for term in article:\n",
    "            index[term].add(docid)\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_positional_index(articles):\n",
    "    \"\"\"\n",
    "    A more advanced version of make_inverted_index. Here each posting is\n",
    "    non only a document id, but a list of positions where the term is\n",
    "    contained in the article.\n",
    "    \"\"\"\n",
    "    index = defaultdict(dict)\n",
    "    for docid, article in enumerate(articles):\n",
    "        for pos, term in enumerate(article):\n",
    "            try:\n",
    "                index[term][docid].append(pos)\n",
    "            except KeyError:\n",
    "                index[term][docid] = [pos]\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def documents_as_vectors(articles):\n",
    "    \"\"\"\n",
    "    Here we generate a list of dictionaries. Each element of the list\n",
    "    represents a document and each document has an associated dict where\n",
    "    to each term the corresponding tf-idf is associated. Since this function\n",
    "    creates a structure of size O(#documents \\times #terms), it can\n",
    "    be used only for small collections.\n",
    "    \"\"\"\n",
    "    p_index = make_positional_index(articles)\n",
    "    vectors = []\n",
    "    n = len(articles)\n",
    "    idf = {}\n",
    "    for term in p_index.keys():\n",
    "        idf[term] = log(n/len(p_index[term]))\n",
    "    for docid in range(0, len(articles)):\n",
    "        # We create a dictionary with a key for each dimension (term)\n",
    "        v = {}\n",
    "        for term in p_index.keys():\n",
    "            try:\n",
    "                tfidf = len(p_index[term][docid]) * idf[term]\n",
    "            except KeyError:\n",
    "                tfidf = 0\n",
    "            v[term] = tfidf\n",
    "        vectors.append(v)\n",
    "    return vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_document_vector(v, docid):\n",
    "    \"\"\"\n",
    "    This function prints, for a document represented as a vector in v, all the\n",
    "    non-zero weights (both normalized and not) and the corresponding terms\n",
    "    \"\"\"\n",
    "    non_zero_terms = [x for x in v[docid].keys() if v[docid][x] > 0]\n",
    "    vector = [(x, v[docid][x]) for x in non_zero_terms]\n",
    "    vector.sort(key=lambda x: x[1], reverse=True)\n",
    "    length = sqrt(sum([x[1]**2 for x in vector]))\n",
    "    normalized = {k: tfidf/length for k, tfidf in vector}\n",
    "    for (term, tfidf) in vector:\n",
    "        print(f\"{term}:\\t{tfidf}\\t(normalized: {normalized[term]})\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example of usage\n",
    "articles = import_dataset()\n",
    "vectors = documents_as_vectors(articles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\" \".join(articles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show_document_vector(vectors, 2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}