YelpReviewAnalyzer/restaurants.py at master · BayLadyCoder/YelpReviewAnalyzer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from beautifulSoupUtils import runBeautifulSoup

# ------- Home/Search Page ------- #
# searchURL = "https://www.yelp.com/search?find_desc=thai%20food&find_loc=owings%20mills"
# example = "https://www.yelp.com/search?find_desc=sushi&find_loc=baltimore"

# Build the URL of the page that lists restaurants matching the user's search input
def searchListURL(find, near):
    """Build the Yelp search URL for a search term and a location.

    Both values are percent-encoded, so spaces become ``%20`` and reserved
    characters such as ``&``, ``#`` or ``=`` in the user's input cannot
    break or truncate the query string.

    Args:
        find: What to search for, as raw (un-encoded) text, e.g. "thai food".
        near: Where to search, as raw (un-encoded) text, e.g. "owings mills".

    Returns:
        The search URL as a string.
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import quote

    Yelp = "https://www.yelp.com/search?find_desc="
    # safe='' also encodes '/', so nothing in the input is left as URL syntax.
    what = quote(find, safe='')
    where = '&find_loc=' + quote(near, safe='')
    return Yelp + what + where


# Find the total number of pages in the restaurant list
def findTotalRestaurantListPages(soup):
    """Read the total number of result pages from a parsed search page.

    Looks inside every ``div`` with ``role="navigation"`` for a ``span``
    carrying one of the pagination text classes whose text contains "Page",
    and takes the number that follows the last " of " in that text
    (e.g. "Page 1 of 24" -> 24). If several spans match, the last one that
    parses wins.

    Args:
        soup: Parsed page object exposing ``find_all`` (presumably a
            BeautifulSoup document -- confirm against ``runBeautifulSoup``).

    Returns:
        The total page count as an ``int``, or ``''`` when no usable span
        is found (the empty-string default is kept for backward
        compatibility).
    """
    totalPages = ''
    spanClasses = [
        "lemon--span__373c0__3997G",
        "text__373c0__2pB8",
        "text-color--normal__373c0__K_MKN",
        "text-align--left__373c0__2pnx_",
    ]
    for div in soup.find_all('div', {'role': 'navigation'}):
        for span in div.find_all("span", {'class': spanClasses}):
            if "Page" not in span.text:
                continue
            # Parse the number after the last " of " instead of slicing at a
            # fixed offset, which breaks once the current page number has a
            # different digit count (e.g. "Page 100 of 240").
            _, separator, lastPart = span.text.strip().rpartition(' of ')
            lastPart = lastPart.strip()
            if separator and lastPart.isdigit():
                totalPages = int(lastPart)

    return totalPages


# Bundles the functions Flask calls once it has the user's input (find and near)
def getListOfRestaurants(find, near):
    """Return the scraped restaurant list for a user's search.

    Builds the search URL from ``find`` and ``near`` with ``searchListURL``
    and passes it to ``scrapRestaurantList``. No page count is supplied, so
    ``scrapRestaurantList`` runs with its default ``totalListPages``.
    """
    return scrapRestaurantList(searchListURL(find, near))


# Get a list of restaurants, their links (URLs), and their review counts, based on the user's search input
def scrapRestaurantList(link, totalListPages = 1):
    """Scrape restaurant names, links and review counts from a search page.

    Fetches ``link`` through ``runBeautifulSoup`` and walks every
    ``div.css-1qn0b6x`` result card. A card is kept only when it has a
    heading (``h3.css-1agk4wl``) with an anchor (``a.css-19v1rkv``), a
    review-count element (``.css-chan6m``) whose text is not ``'--:--'``,
    a non-empty name and link, and a review count that can be parsed.
    Cards are de-duplicated by name; a later card with the same name
    replaces the earlier one.

    Args:
        link: URL of the search-results page to scrape.
        totalListPages: Currently unused. Only the page at ``link`` is
            scraped, to keep the request fast and the option list short;
            the parameter is kept so existing callers keep working.

    Returns:
        A list of dicts with the keys ``"name"``, ``"link"`` and
        ``"review_counts"`` (an ``int``).
    """
    url = link
    print(f"restaurant list url: {url}")
    soup2 = runBeautifulSoup(url)

    restaurants = {}
    for div in soup2.find_all('div', class_="css-1qn0b6x"):
        span = div.select_one(".css-chan6m")
        h3 = div.select_one("h3.css-1agk4wl")
        if h3 is None or span is None:
            continue

        a = h3.find('a', {'class': 'css-19v1rkv'})
        if a is None:
            # Heading without the expected anchor: nothing to link to.
            continue

        name = a.text
        href = a.get('href')
        if name == "" or href == "" or span.text == '--:--':
            continue

        try:
            reviewCount = _parseReviewCount(span.text)
        except ValueError:
            # Count text is not in the expected shape; skip this card
            # rather than aborting the whole scrape.
            continue

        restaurants[name] = {
            "name": name,
            "link": href,
            "review_counts": reviewCount,
        }

    return list(restaurants.values())


def _parseReviewCount(spanText):
    """Convert review-count text to an int; raise ValueError if unparsable.

    Takes the first space-separated token and drops its first character,
    then reads it as a number, with a trailing ``k`` meaning thousands.
    Assumes text shaped like "(123 reviews)" or "(1.2k reviews)" --
    TODO confirm against the live page markup.
    """
    token = spanText.split(' ')[0][1:]
    if token.endswith('k'):
        # round() removes float artifacts such as 1.1 * 1000 == 1100.0000000000002
        return int(round(float(token[:-1]) * 1000))
    return int(token)