|
1 | 1 | { |
2 | 2 | "metadata": { |
3 | 3 | "name": "", |
4 | | - "signature": "sha256:24992d3723674f6b6b156964e19fb9314512fb03f63b3f1d29be2b0f12a35f57" |
| 4 | + "signature": "sha256:848b292fb7835ca2ce202988038e9f1498a4b6ee1e21bbf3768481c16ef3ac39" |
5 | 5 | }, |
6 | 6 | "nbformat": 3, |
7 | 7 | "nbformat_minor": 0, |
|
416 | 416 | "input": [ |
417 | 417 | "from whoosh.query import Or \n", |
418 | 418 | "\n", |
419 | | - "query = Or([Term(\"text\", \"topic\"), Term(\"text\", \"model\")]) # insert your code here\n", |
420 | | - "query" |
| 419 | + "# insert your code here" |
421 | 420 | ], |
422 | 421 | "language": "python", |
423 | 422 | "metadata": {}, |
|
434 | 433 | "cell_type": "code", |
435 | 434 | "collapsed": false, |
436 | 435 | "input": [ |
437 | | - "query = And([Term(\"author\", \"Trieschnigg\"), Term(\"text\", \"index\"), Term(\"text\", \"topic\")])\n", |
438 | | - "query" |
| 436 | + "# insert your code here" |
439 | 437 | ], |
440 | 438 | "language": "python", |
441 | 439 | "metadata": {}, |
|
452 | 450 | "cell_type": "markdown", |
453 | 451 | "metadata": {}, |
454 | 452 | "source": [ |
455 | | - "These query constructs are very explicit and clearn. It is, however, much more convenient to use Whoosh' `QuerParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:" |
 | 453 | + "These query constructs are very explicit and clean. It is, however, much more convenient to use Whoosh's `QueryParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:" |
456 | 454 | ] |
457 | 455 | }, |
458 | 456 | { |
|
660 | 658 | "\n", |
661 | 659 | "def pdftotext(pdf):\n", |
662 | 660 | " # insert your code here\n", |
663 | | - " basename, _ = os.path.splitext(os.path.basename(pdf))\n", |
664 | | - " subprocess.call(['pdftotext', '-enc', 'UTF-8',\n", |
665 | | - " pdf, os.path.join('data', basename + '.txt')])\n", |
666 | 661 | "\n", |
667 | 662 | "# if your answer is correct this should print the first 1000 bytes of the text file\n", |
668 | 663 | "pdftotext(\"pdfs/blei2003.pdf\")\n", |
|
785 | 780 | " and return a dictionary consisting of the author, title and \n", |
786 | 781 | " text.\"\"\"\n", |
787 | 782 | " basename, _ = os.path.splitext(os.path.basename(pdf))\n", |
788 | | - " # insert your code here\n", |
789 | | - " subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta',\n", |
790 | | - " pdf, os.path.join('data', basename + '.html')])\n", |
791 | | - " data = parse_html(os.path.join('data', basename + '.html'))\n", |
792 | | - " with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n", |
793 | | - " outfile.write(data['text'])\n", |
794 | | - " return data" |
| 783 | + " # insert your code here" |
795 | 784 | ], |
796 | 785 | "language": "python", |
797 | 786 | "metadata": {}, |
|
822 | 811 | " with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n", |
823 | 812 | " outfile.write(data['text'])\n", |
824 | 813 | " # insert your code here\n", |
825 | | - " os.remove(os.path.join('data', basename + '.html'))\n", |
826 | | - " if not os.path.exists('static/pdfs'):\n", |
827 | | - " os.mkdir('static/pdfs')\n", |
828 | | - " shutil.copy(pdf, os.path.join('static/pdfs', basename + '.pdf'))\n", |
829 | | - " data['source'] = os.path.join('static/pdfs', basename + '.pdf')\n", |
830 | | - " data['target'] = os.path.join('data', basename + '.txt')\n", |
831 | | - " data['id'] = basename\n", |
832 | | - " return data\n", |
833 | 814 | "\n", |
834 | 815 | "pdftotext(\"pdfs/muellner2011.pdf\")" |
835 | 816 | ], |
|
1003 | 984 | " \"Main routine to index a collection of PDFs using Whoosh.\"\n", |
1004 | 985 | " config = configparser.ConfigParser()\n", |
1005 | 986 | " # read the configuration file\n", |
1006 | | - " config.read(configpath) # remove from students version\n", |
| 987 | + " # insert your code here\n", |
| 988 | + " \n", |
1007 | 989 | " recompile = config.getboolean(\"indexer.options\", \"recompile\")\n", |
1008 | 990 | " # check whether the supplied index directory already exists\n", |
1009 | 991 | " if not os.path.exists(config.get(\"filepaths\", \"index directory\")):\n", |
|
1012 | 994 | " index = create_in(config.get(\"filepaths\", \"index directory\"), schema=pdf_schema)\n", |
1013 | 995 | " recompile = True\n", |
1014 | 996 | " # open a connection to the index\n", |
1015 | | - " index = open_dir(config.get(\"filepaths\", \"index directory\")) # remove from students version\n", |
| 997 | + " index = # insert your code here\n", |
| 998 | + " \n", |
1016 | 999 | " # retrieve a set of all file IDs we already indexed\n", |
1017 | 1000 | " indexed = set(map(fileid, os.listdir(config.get(\"filepaths\", \"txt directory\"))))\n", |
1018 | 1001 | " # initialize a IndexWriter object\n", |
1019 | | - " writer = index.writer() # remove from students version\n", |
| 1002 | + " writer = # insert your code here\n", |
| 1003 | + " \n", |
1020 | 1004 | " # iterate over all directories \n", |
1021 | 1005 | " for directory in config.get(\"filepaths\", \"pdf directory\").split(';'):\n", |
1022 | 1006 | " # iterate over all PDF files in this directory\n", |
1023 | | - " for filepath in glob.glob(directory + \"/*.pdf\"): # remove from students version\n", |
| 1007 | + " for filepath in glob.glob(directory + \"/*.pdf\"):\n", |
1024 | 1008 | " # poor man's solution to check whether we already indexed this pdf\n", |
1025 | 1009 | " if fileid(filepath) not in indexed or recompile:\n", |
1026 | 1010 | " try:\n", |
1027 | 1011 | " # call the function pdftotext with the correct arguments\n", |
1028 | | - " data = pdftotext( # remove from students version\n", |
1029 | | - " filepath, \n", |
1030 | | - " outdir=config.get(\"filepaths\", \"txt directory\"),\n", |
1031 | | - " sourcedir=config.get(\"filepaths\", \"source directory\"),\n", |
1032 | | - " p2t=config.get('programpaths', 'pdftotext'),\n", |
1033 | | - " move=config.getboolean(\"indexer.options\", \"move pdfs\"))\n", |
| 1012 | + " data = # insert your code here\n", |
| 1013 | + " \n", |
1034 | 1014 | " # add the new document to the index\n", |
1035 | 1015 | " writer.add_document(**data)\n", |
1036 | 1016 | " except (IOError, UnicodeDecodeError) as error:\n", |
1037 | 1017 | " print(error)\n", |
1038 | 1018 | " # commit our changes\n", |
1039 | | - " writer.commit() # remove from students version" |
| 1019 | + " # insert your code here" |
1040 | 1020 | ], |
1041 | 1021 | "language": "python", |
1042 | 1022 | "metadata": {}, |
|
1197 | 1177 | "\n", |
1198 | 1178 | "def search(query):\n", |
1199 | 1179 | " # insert your code here\n", |
1200 | | - " index = open_dir(\"pdf-index\")\n", |
1201 | | - " query = QueryParser(\"text\", index.schema).parse(query)\n", |
1202 | | - " with index.searcher() as searcher:\n", |
1203 | | - " results = searcher.search(query)\n", |
1204 | | - " for hit in results:\n", |
1205 | | - " yield dict(hit)\n", |
1206 | 1180 | " \n", |
1207 | 1181 | "print(list(search(\"(topic model) OR (index probability\")))" |
1208 | 1182 | ], |
|
1231 | 1205 | "input": [ |
1232 | 1206 | "def search(query):\n", |
1233 | 1207 | " # insert your code here\n", |
1234 | | - " index = open_dir(\"pdf-index\")\n", |
1235 | | - " query = QueryParser(\"text\", index.schema).parse(query)\n", |
1236 | | - " with index.searcher() as searcher:\n", |
1237 | | - " results = searcher.search(query)\n", |
1238 | | - " for hit in results:\n", |
1239 | | - " result = dict(hit)\n", |
1240 | | - " with open(result['path']) as infile:\n", |
1241 | | - " result['snippet'] = hit.highlights(\"text\", infile.read(), top=3)\n", |
1242 | | - " yield result\n", |
1243 | 1208 | " \n", |
1244 | 1209 | "print(list(search(\"(topic model) OR (index probability\")))" |
1245 | 1210 | ], |
|
0 commit comments