|
1 | 1 | { |
2 | 2 | "metadata": { |
3 | 3 | "name": "", |
4 | | - "signature": "sha256:24992d3723674f6b6b156964e19fb9314512fb03f63b3f1d29be2b0f12a35f57" |
| 4 | + "signature": "sha256:848b292fb7835ca2ce202988038e9f1498a4b6ee1e21bbf3768481c16ef3ac39" |
5 | 5 | }, |
6 | 6 | "nbformat": 3, |
7 | 7 | "nbformat_minor": 0, |
|
416 | 416 | "input": [ |
417 | 417 | "from whoosh.query import Or \n", |
418 | 418 | "\n", |
419 | | - "query = Or([Term(\"text\", \"topic\"), Term(\"text\", \"model\")]) # insert your code here\n", |
420 | | - "query" |
| 419 | + "# insert your code here" |
421 | 420 | ], |
422 | 421 | "language": "python", |
423 | 422 | "metadata": {}, |
|
434 | 433 | "cell_type": "code", |
435 | 434 | "collapsed": false, |
436 | 435 | "input": [ |
437 | | - "query = And([Term(\"author\", \"Trieschnigg\"), Term(\"text\", \"index\"), Term(\"text\", \"topic\")])\n", |
438 | | - "query" |
| 436 | + "# insert your code here" |
439 | 437 | ], |
440 | 438 | "language": "python", |
441 | 439 | "metadata": {}, |
|
452 | 450 | "cell_type": "markdown", |
453 | 451 | "metadata": {}, |
454 | 452 | "source": [ |
455 | | - "These query constructs are very explicit and clearn. It is, however, much more convenient to use Whoosh' `QuerParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:" |
 | 453 | + "These query constructs are very explicit and clean. It is, however, much more convenient to use Whoosh's `QueryParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:" |
456 | 454 | ] |
457 | 455 | }, |
458 | 456 | { |
|
660 | 658 | "\n", |
661 | 659 | "def pdftotext(pdf):\n", |
662 | 660 | " # insert your code here\n", |
663 | | - " basename, _ = os.path.splitext(os.path.basename(pdf))\n", |
664 | | - " subprocess.call(['pdftotext', '-enc', 'UTF-8',\n", |
665 | | - " pdf, os.path.join('data', basename + '.txt')])\n", |
666 | 661 | "\n", |
667 | 662 | "# if your answer is correct this should print the first 1000 bytes of the text file\n", |
668 | 663 | "pdftotext(\"pdfs/blei2003.pdf\")\n", |
|
785 | 780 | " and return a dictionary consisting of the author, title and \n", |
786 | 781 | " text.\"\"\"\n", |
787 | 782 | " basename, _ = os.path.splitext(os.path.basename(pdf))\n", |
788 | | - " # insert your code here\n", |
789 | | - " subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta',\n", |
790 | | - " pdf, os.path.join('data', basename + '.html')])\n", |
791 | | - " data = parse_html(os.path.join('data', basename + '.html'))\n", |
792 | | - " with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n", |
793 | | - " outfile.write(data['text'])\n", |
794 | | - " return data" |
| 783 | + " # insert your code here" |
795 | 784 | ], |
796 | 785 | "language": "python", |
797 | 786 | "metadata": {}, |
|
822 | 811 | " with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n", |
823 | 812 | " outfile.write(data['text'])\n", |
824 | 813 | " # insert your code here\n", |
825 | | - " os.remove(os.path.join('data', basename + '.html'))\n", |
826 | | - " if not os.path.exists('static/pdfs'):\n", |
827 | | - " os.mkdir('static/pdfs')\n", |
828 | | - " shutil.copy(pdf, os.path.join('static/pdfs', basename + '.pdf'))\n", |
829 | | - " data['source'] = os.path.join('static/pdfs', basename + '.pdf')\n", |
830 | | - " data['target'] = os.path.join('data', basename + '.txt')\n", |
831 | | - " data['id'] = basename\n", |
832 | | - " return data\n", |
833 | 814 | "\n", |
834 | 815 | "pdftotext(\"pdfs/muellner2011.pdf\")" |
835 | 816 | ], |
|
1003 | 984 | " \"Main routine to index a collection of PDFs using Whoosh.\"\n", |
1004 | 985 | " config = configparser.ConfigParser()\n", |
1005 | 986 | " # read the configuration file\n", |
1006 | | - " config.read(configpath) # remove from students version\n", |
| 987 | + " # insert your code here\n", |
| 988 | + " \n", |
1007 | 989 | " recompile = config.getboolean(\"indexer.options\", \"recompile\")\n", |
1008 | 990 | " # check whether the supplied index directory already exists\n", |
1009 | 991 | " if not os.path.exists(config.get(\"filepaths\", \"index directory\")):\n", |
|
1012 | 994 | " index = create_in(config.get(\"filepaths\", \"index directory\"), schema=pdf_schema)\n", |
1013 | 995 | " recompile = True\n", |
1014 | 996 | " # open a connection to the index\n", |
1015 | | - " index = open_dir(config.get(\"filepaths\", \"index directory\")) # remove from students version\n", |
| 997 | + " index = # insert your code here\n", |
| 998 | + " \n", |
1016 | 999 | " # retrieve a set of all file IDs we already indexed\n", |
1017 | 1000 | " indexed = set(map(fileid, os.listdir(config.get(\"filepaths\", \"txt directory\"))))\n", |
1018 | 1001 | " # initialize a IndexWriter object\n", |
1019 | | - " writer = index.writer() # remove from students version\n", |
| 1002 | + " writer = # insert your code here\n", |
| 1003 | + " \n", |
1020 | 1004 | " # iterate over all directories \n", |
1021 | 1005 | " for directory in config.get(\"filepaths\", \"pdf directory\").split(';'):\n", |
1022 | 1006 | " # iterate over all PDF files in this directory\n", |
1023 | | - " for filepath in glob.glob(directory + \"/*.pdf\"): # remove from students version\n", |
| 1007 | + " for filepath in glob.glob(directory + \"/*.pdf\"):\n", |
1024 | 1008 | " # poor man's solution to check whether we already indexed this pdf\n", |
1025 | 1009 | " if fileid(filepath) not in indexed or recompile:\n", |
1026 | 1010 | " try:\n", |
1027 | 1011 | " # call the function pdftotext with the correct arguments\n", |
1028 | | - " data = pdftotext( # remove from students version\n", |
1029 | | - " filepath, \n", |
1030 | | - " outdir=config.get(\"filepaths\", \"txt directory\"),\n", |
1031 | | - " sourcedir=config.get(\"filepaths\", \"source directory\"),\n", |
1032 | | - " p2t=config.get('programpaths', 'pdftotext'),\n", |
1033 | | - " move=config.getboolean(\"indexer.options\", \"move pdfs\"))\n", |
| 1012 | + " data = # insert your code here\n", |
| 1013 | + " \n", |
1034 | 1014 | " # add the new document to the index\n", |
1035 | 1015 | " writer.add_document(**data)\n", |
1036 | 1016 | " except (IOError, UnicodeDecodeError) as error:\n", |
1037 | 1017 | " print(error)\n", |
1038 | 1018 | " # commit our changes\n", |
1039 | | - " writer.commit() # remove from students version" |
| 1019 | + " # insert your code here" |
1040 | 1020 | ], |
1041 | 1021 | "language": "python", |
1042 | 1022 | "metadata": {}, |
|
1197 | 1177 | "\n", |
1198 | 1178 | "def search(query):\n", |
1199 | 1179 | " # insert your code here\n", |
1200 | | - " index = open_dir(\"pdf-index\")\n", |
1201 | | - " query = QueryParser(\"text\", index.schema).parse(query)\n", |
1202 | | - " with index.searcher() as searcher:\n", |
1203 | | - " results = searcher.search(query)\n", |
1204 | | - " for hit in results:\n", |
1205 | | - " yield dict(hit)\n", |
1206 | 1180 | " \n", |
1207 | 1181 | "print(list(search(\"(topic model) OR (index probability\")))" |
1208 | 1182 | ], |
|
1231 | 1205 | "input": [ |
1232 | 1206 | "def search(query):\n", |
1233 | 1207 | " # insert your code here\n", |
1234 | | - " index = open_dir(\"pdf-index\")\n", |
1235 | | - " query = QueryParser(\"text\", index.schema).parse(query)\n", |
1236 | | - " with index.searcher() as searcher:\n", |
1237 | | - " results = searcher.search(query)\n", |
1238 | | - " for hit in results:\n", |
1239 | | - " result = dict(hit)\n", |
1240 | | - " with open(result['path']) as infile:\n", |
1241 | | - " result['snippet'] = hit.highlights(\"text\", infile.read(), top=3)\n", |
1242 | | - " yield result\n", |
1243 | 1208 | " \n", |
1244 | 1209 | "print(list(search(\"(topic model) OR (index probability\")))" |
1245 | 1210 | ], |
|
0 commit comments