Skip to content
This repository was archived by the owner on Jan 7, 2021. It is now read-only.

Commit 3bc1340

Browse files
committed
Attempt at patching the `get_page_text` bug identified by @aboutaaron in #94. Also added a unit test to make sure it keeps working in the future.
1 parent 58bf180 commit 3bc1340

2 files changed

Lines changed: 12 additions & 1 deletion

File tree

documentcloud/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,7 @@ def get_page_text_url(self, page):
         """
         template = self.resources.page.get('text')
         url = template.replace("{page}", str(page))
-        return self._get_url(url)
+        return url
 
     def get_page_text(self, page):
         """

test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,17 @@ def test_public_actions(self):
         # for all the old documents in the database.
         #self.assertEqual(hashlib.sha1(pdf).hexdigest(), obj.file_hash)
 
+        # Text
+        self.assertEqual(
+            obj.get_page_text_url(1),
+            'https://www.documentcloud.org/documents/74103/pages/\
+report-of-the-calpers-special-review-p1.txt'
+        )
+        self.assertEqual(
+            document.get_page_text(1).split("\n")[0].strip(),
+            "Report of the CalPERS Special Review"
+        )
+
         # Images
         self.assertTrue(len(obj.small_image) > 0)
         self.assertTrue(len(obj.thumbnail_image) > 0)

0 commit comments

Comments
 (0)