Skip to content
This repository was archived by the owner on Mar 8, 2021. It is now read-only.

Commit 99bf850

Browse files
committed
remove answers
1 parent 5c5e63b commit 99bf850

1 file changed

Lines changed: 15 additions & 50 deletions

File tree

Chapter 8 - PDF Search App.ipynb

Lines changed: 15 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"metadata": {
33
"name": "",
4-
"signature": "sha256:24992d3723674f6b6b156964e19fb9314512fb03f63b3f1d29be2b0f12a35f57"
4+
"signature": "sha256:848b292fb7835ca2ce202988038e9f1498a4b6ee1e21bbf3768481c16ef3ac39"
55
},
66
"nbformat": 3,
77
"nbformat_minor": 0,
@@ -416,8 +416,7 @@
416416
"input": [
417417
"from whoosh.query import Or \n",
418418
"\n",
419-
"query = Or([Term(\"text\", \"topic\"), Term(\"text\", \"model\")]) # insert your code here\n",
420-
"query"
419+
"# insert your code here"
421420
],
422421
"language": "python",
423422
"metadata": {},
@@ -434,8 +433,7 @@
434433
"cell_type": "code",
435434
"collapsed": false,
436435
"input": [
437-
"query = And([Term(\"author\", \"Trieschnigg\"), Term(\"text\", \"index\"), Term(\"text\", \"topic\")])\n",
438-
"query"
436+
"# insert your code here"
439437
],
440438
"language": "python",
441439
"metadata": {},
@@ -452,7 +450,7 @@
452450
"cell_type": "markdown",
453451
"metadata": {},
454452
"source": [
455-
"These query constructs are very explicit and clearn. It is, however, much more convenient to use Whoosh' `QuerParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:"
453+
"These query constructs are very explicit and clean. It is, however, much more convenient to use Whoosh's `QueryParser` object to automatically parse strings into `Query` objects. We construct a query parser as follows:"
456454
]
457455
},
458456
{
@@ -660,9 +658,6 @@
660658
"\n",
661659
"def pdftotext(pdf):\n",
662660
" # insert your code here\n",
663-
" basename, _ = os.path.splitext(os.path.basename(pdf))\n",
664-
" subprocess.call(['pdftotext', '-enc', 'UTF-8',\n",
665-
" pdf, os.path.join('data', basename + '.txt')])\n",
666661
"\n",
667662
"# if your answer is correct this should print the first 1000 bytes of the text file\n",
668663
"pdftotext(\"pdfs/blei2003.pdf\")\n",
@@ -785,13 +780,7 @@
785780
" and return a dictionary consisting of the author, title and \n",
786781
" text.\"\"\"\n",
787782
" basename, _ = os.path.splitext(os.path.basename(pdf))\n",
788-
" # insert your code here\n",
789-
" subprocess.call(['pdftotext', '-enc', 'UTF-8', '-htmlmeta',\n",
790-
" pdf, os.path.join('data', basename + '.html')])\n",
791-
" data = parse_html(os.path.join('data', basename + '.html'))\n",
792-
" with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n",
793-
" outfile.write(data['text'])\n",
794-
" return data"
783+
" # insert your code here"
795784
],
796785
"language": "python",
797786
"metadata": {},
@@ -822,14 +811,6 @@
822811
" with open(os.path.join('data', basename + '.txt'), 'w') as outfile:\n",
823812
" outfile.write(data['text'])\n",
824813
" # insert your code here\n",
825-
" os.remove(os.path.join('data', basename + '.html'))\n",
826-
" if not os.path.exists('static/pdfs'):\n",
827-
" os.mkdir('static/pdfs')\n",
828-
" shutil.copy(pdf, os.path.join('static/pdfs', basename + '.pdf'))\n",
829-
" data['source'] = os.path.join('static/pdfs', basename + '.pdf')\n",
830-
" data['target'] = os.path.join('data', basename + '.txt')\n",
831-
" data['id'] = basename\n",
832-
" return data\n",
833814
"\n",
834815
"pdftotext(\"pdfs/muellner2011.pdf\")"
835816
],
@@ -1003,7 +984,8 @@
1003984
" \"Main routine to index a collection of PDFs using Whoosh.\"\n",
1004985
" config = configparser.ConfigParser()\n",
1005986
" # read the configuration file\n",
1006-
" config.read(configpath) # remove from students version\n",
987+
" # insert your code here\n",
988+
" \n",
1007989
" recompile = config.getboolean(\"indexer.options\", \"recompile\")\n",
1008990
" # check whether the supplied index directory already exists\n",
1009991
" if not os.path.exists(config.get(\"filepaths\", \"index directory\")):\n",
@@ -1012,31 +994,29 @@
1012994
" index = create_in(config.get(\"filepaths\", \"index directory\"), schema=pdf_schema)\n",
1013995
" recompile = True\n",
1014996
" # open a connection to the index\n",
1015-
" index = open_dir(config.get(\"filepaths\", \"index directory\")) # remove from students version\n",
997+
" index = # insert your code here\n",
998+
" \n",
1016999
" # retrieve a set of all file IDs we already indexed\n",
10171000
" indexed = set(map(fileid, os.listdir(config.get(\"filepaths\", \"txt directory\"))))\n",
10181001
" # initialize an IndexWriter object\n",
1019-
" writer = index.writer() # remove from students version\n",
1002+
" writer = # insert your code here\n",
1003+
" \n",
10201004
" # iterate over all directories \n",
10211005
" for directory in config.get(\"filepaths\", \"pdf directory\").split(';'):\n",
10221006
" # iterate over all PDF files in this directory\n",
1023-
" for filepath in glob.glob(directory + \"/*.pdf\"): # remove from students version\n",
1007+
" for filepath in glob.glob(directory + \"/*.pdf\"):\n",
10241008
" # poor man's solution to check whether we already indexed this pdf\n",
10251009
" if fileid(filepath) not in indexed or recompile:\n",
10261010
" try:\n",
10271011
" # call the function pdftotext with the correct arguments\n",
1028-
" data = pdftotext( # remove from students version\n",
1029-
" filepath, \n",
1030-
" outdir=config.get(\"filepaths\", \"txt directory\"),\n",
1031-
" sourcedir=config.get(\"filepaths\", \"source directory\"),\n",
1032-
" p2t=config.get('programpaths', 'pdftotext'),\n",
1033-
" move=config.getboolean(\"indexer.options\", \"move pdfs\"))\n",
1012+
" data = # insert your code here\n",
1013+
" \n",
10341014
" # add the new document to the index\n",
10351015
" writer.add_document(**data)\n",
10361016
" except (IOError, UnicodeDecodeError) as error:\n",
10371017
" print(error)\n",
10381018
" # commit our changes\n",
1039-
" writer.commit() # remove from students version"
1019+
" # insert your code here"
10401020
],
10411021
"language": "python",
10421022
"metadata": {},
@@ -1197,12 +1177,6 @@
11971177
"\n",
11981178
"def search(query):\n",
11991179
" # insert your code here\n",
1200-
" index = open_dir(\"pdf-index\")\n",
1201-
" query = QueryParser(\"text\", index.schema).parse(query)\n",
1202-
" with index.searcher() as searcher:\n",
1203-
" results = searcher.search(query)\n",
1204-
" for hit in results:\n",
1205-
" yield dict(hit)\n",
12061180
" \n",
12071181
"print(list(search(\"(topic model) OR (index probability\")))"
12081182
],
@@ -1231,15 +1205,6 @@
12311205
"input": [
12321206
"def search(query):\n",
12331207
" # insert your code here\n",
1234-
" index = open_dir(\"pdf-index\")\n",
1235-
" query = QueryParser(\"text\", index.schema).parse(query)\n",
1236-
" with index.searcher() as searcher:\n",
1237-
" results = searcher.search(query)\n",
1238-
" for hit in results:\n",
1239-
" result = dict(hit)\n",
1240-
" with open(result['path']) as infile:\n",
1241-
" result['snippet'] = hit.highlights(\"text\", infile.read(), top=3)\n",
1242-
" yield result\n",
12431208
" \n",
12441209
"print(list(search(\"(topic model) OR (index probability\")))"
12451210
],

0 commit comments

Comments
 (0)