-
Notifications
You must be signed in to change notification settings - Fork 35
Scrape categories and topics for news.py
#215
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,14 @@ | |
| class NewsTest(unittest.TestCase): | ||
| def __init__(self, *args, **kwargs): | ||
| unittest.TestCase.__init__(self, *args, **kwargs) | ||
| with (SAMPLE_PATH / "news_pittwire.html").open() as f: | ||
| self.pittwire = f.read() | ||
| with (SAMPLE_PATH / "news_pittwire_no_categories.html").open() as f: | ||
| self.pittwire_no_categories = f.read() | ||
| with (SAMPLE_PATH / "news_features_articles.html").open() as f: | ||
| self.features_articles = f.read() | ||
| with (SAMPLE_PATH / "news_features_articles_no_topics.html").open() as f: | ||
| self.features_articles_no_topics = f.read() | ||
| with (SAMPLE_PATH / "news_university_news_features_articles_page_0.html").open() as f: | ||
| self.university_news_features_articles_page_0 = f.read() | ||
| with (SAMPLE_PATH / "news_university_news_features_articles_page_1.html").open() as f: | ||
|
|
@@ -39,16 +47,77 @@ def __init__(self, *args, **kwargs): | |
| with (SAMPLE_PATH / "news_university_news_features_articles_2020.html").open() as f: | ||
| self.university_news_features_articles_2020 = f.read() | ||
|
|
||
| @responses.activate | ||
| def test_get_categories(self): | ||
| news.get_categories.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
|
|
||
| categories = news.get_categories() | ||
|
|
||
| self.assertCountEqual( | ||
| categories, ["Features & Articles", "Accolades & Honors", "Ones to Watch", "Announcements and Updates"] | ||
| ) | ||
|
|
||
| @responses.activate | ||
| def test_get_categories_missing(self): | ||
| news.get_categories.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire_no_categories) | ||
|
|
||
| self.assertRaises(RuntimeError, news.get_categories) | ||
|
|
||
| @responses.activate | ||
| def test_get_topics(self): | ||
| news.get_topics.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
|
|
||
| topics = news.get_topics() | ||
|
|
||
| self.assertCountEqual( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this test going to hold true even into the future?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, all the tests for the |
||
| topics, | ||
| [ | ||
| "University News", | ||
| "Health and Wellness", | ||
| "Technology & Science", | ||
| "Arts and Humanities", | ||
| "Community Impact", | ||
| "Innovation and Research", | ||
| "Global", | ||
| "Diversity, Equity, and Inclusion", | ||
| "Our City/Our Campus", | ||
| "Teaching & Learning", | ||
| "Space", | ||
| "Ukraine", | ||
| "Sustainability", | ||
| ], | ||
| ) | ||
|
|
||
| @responses.activate | ||
| def test_get_topics_missing(self): | ||
| news.get_topics.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles_no_topics) | ||
|
|
||
| self.assertRaises(RuntimeError, news.get_topics) | ||
|
|
||
| @responses.activate | ||
| def test_get_articles_by_topic(self): | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
| responses.add( | ||
| responses.GET, | ||
| "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title=" | ||
| "&field_category_target_id=All", | ||
| body=self.university_news_features_articles_page_0, | ||
| ) | ||
|
|
||
| university_news_articles = news.get_articles_by_topic("university-news") | ||
| university_news_articles = news.get_articles_by_topic("University News") | ||
|
|
||
| self.assertEqual(len(university_news_articles), news.NUM_ARTICLES_PER_PAGE) | ||
| self.assertEqual( | ||
|
|
@@ -75,14 +144,20 @@ def test_get_articles_by_topic(self): | |
| @responses.activate | ||
| def test_get_articles_by_topic_query(self): | ||
| query = "fulbright" | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
| responses.add( | ||
| responses.GET, | ||
| "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=" | ||
| f"&title={query}&field_category_target_id=All", | ||
| body=self.university_news_features_articles_fulbright, | ||
| ) | ||
|
|
||
| university_news_articles = news.get_articles_by_topic("university-news", query=query) | ||
| university_news_articles = news.get_articles_by_topic("University News", query=query) | ||
|
|
||
| self.assertEqual(len(university_news_articles), 3) | ||
| self.assertEqual( | ||
|
|
@@ -115,14 +190,20 @@ def test_get_articles_by_topic_query(self): | |
| @responses.activate | ||
| def test_get_articles_by_topic_year(self): | ||
| year = 2020 | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
| responses.add( | ||
| responses.GET, | ||
| f"https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value={year}" | ||
| "&title=&field_category_target_id=All", | ||
| body=self.university_news_features_articles_2020, | ||
| ) | ||
|
|
||
| university_news_articles = news.get_articles_by_topic("university-news", year=year) | ||
| university_news_articles = news.get_articles_by_topic("University News", year=year) | ||
|
|
||
| self.assertEqual(len(university_news_articles), 5) | ||
| self.assertEqual( | ||
|
|
@@ -152,14 +233,20 @@ def test_get_articles_by_topic_year(self): | |
| @responses.activate | ||
| def test_get_articles_by_topic_less_than_one_page(self): | ||
| num_results = 5 | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
| responses.add( | ||
| responses.GET, | ||
| "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title=" | ||
| "&field_category_target_id=All", | ||
| body=self.university_news_features_articles_page_0, | ||
| ) | ||
|
|
||
| university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results) | ||
| university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results) | ||
|
|
||
| self.assertEqual(len(university_news_articles), num_results) | ||
| self.assertEqual( | ||
|
|
@@ -186,6 +273,12 @@ def test_get_articles_by_topic_less_than_one_page(self): | |
| @responses.activate | ||
| def test_get_articles_by_topic_multiple_pages(self): | ||
| num_results = news.NUM_ARTICLES_PER_PAGE + 5 | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
| responses.add( | ||
| responses.GET, | ||
| "https://www.pitt.edu/pittwire/news/features-articles?field_topics_target_id=432&field_article_date_value=&title=" | ||
|
|
@@ -199,7 +292,7 @@ def test_get_articles_by_topic_multiple_pages(self): | |
| body=self.university_news_features_articles_page_1, | ||
| ) | ||
|
|
||
| university_news_articles = news.get_articles_by_topic("university-news", max_num_results=num_results) | ||
| university_news_articles = news.get_articles_by_topic("University News", max_num_results=num_results) | ||
|
|
||
| self.assertEqual(len(university_news_articles), num_results) | ||
| self.assertEqual( | ||
|
|
@@ -227,3 +320,25 @@ def test_get_articles_by_topic_multiple_pages(self): | |
| ], | ||
| ), | ||
| ) | ||
|
|
||
| @responses.activate | ||
| def test_get_articles_by_topic_invalid_category(self): | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
|
|
||
| self.assertRaises(ValueError, news.get_articles_by_topic, "University News", "Invalid Category") | ||
|
|
||
| @responses.activate | ||
| def test_get_articles_by_topic_invalid_topic(self): | ||
| news.get_categories.cache_clear() | ||
| news.get_topics.cache_clear() | ||
| news._scrape_categories.cache_clear() | ||
| news._scrape_topics.cache_clear() | ||
| responses.add(responses.GET, news.PITTWIRE_URL, body=self.pittwire) | ||
| responses.add(responses.GET, news.FEATURES_ARTICLES_URL, body=self.features_articles) | ||
|
|
||
| self.assertRaises(ValueError, news.get_articles_by_topic, "Invalid Topic") | ||
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are these version changes needed? You bump up the minimum Python version
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of them aren't necessary, but the
`jinja2` version change is necessary as a security update (the repo currently has 4 Dependabot alerts for security vulnerabilities in the current `jinja2` version). I had simply run `pipenv update` when I was making my changes for this PR. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make the dependency changes in a different PR