import requests_cache
import requests
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from pandas.tools.plotting import scatter_matrix
import numpy as np
import seaborn as sns
from requests_oauthlib import OAuth1Session
import glob
import json
import os
import urllib2
from urllib2 import Request, urlopen
from requests_oauthlib import OAuth1Session
from bs4 import BeautifulSoup
import sqlite3 as sql
requests_cache.install_cache("cache")
# criteria: 1=Distance, 2=Highest Rated
def yelp_search(term, location, criteria):
"""
makes a search request
Argument: search term, location
Return: search result items as a list (for 0 items, return an empty list)
"""
consumer_key = "uFKYftN0fu4pD6guW2cYng"
consumer_secret = "iUYQ2Hk3-vTQVd0zOFnvloWPivA"
token = "FZnep4FiFmwrrifuNm6M-_K661Wb3J02"
token_secret = "CRvBo_fO-r4pE_-p2UrnN_mxBlc"
urlbase = "http://api.yelp.com/v2/search"
params = ({"term": term})
criteria = criteria.lower()
if criteria == "distance":
params.update({"location": location, "sort": "1", "cll" : "latitude,longitude"})
elif criteria == "highest rated":
params.update({"location": location, "sort": "2", "cll" : "latitude,longitude"})
else:
params.update({"location": location, "sort": "0", "cll" : "latitude,longitude"})
session = OAuth1Session(consumer_key, consumer_secret,token,token_secret)
request = session.get(urlbase,params=params)
#Transforms JSON API response into dictionary
data = request.json()
session.close()
output = data
if "businesses" not in output:
return {}
else:
return output["businesses"]
def extract(url):
"""
extract city information from a url
Argument: url
Return: a dictionary with info
"""
info = requests.get(url).content
wikisoup = BeautifulSoup(info, 'lxml')
city_data = wikisoup.findAll("th", {"scope": "row"})
population_data = wikisoup.findAll("td", {"align":"right"})
city = [city.text for city in city_data]
city = city[1:-8]
population_data = wikisoup.findAll("td", {"align":"right"})
population_data = "\n".join([population.text for population in population_data])
population_split = population_data.split("\n")
population = population_split[0::2]
info_dict = {"city": city, "city_population": population}
return info_dict
# extract city info from the wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_California"
url_output = extract(url)
pd_scrape = pd.DataFrame(url_output)
# this excel provide latitude and longitude information
pd_location = pd.read_excel("/Users/Chloechen/Downloads/calcities.xlsx", header = 0)
pd_location_1 = pd_location.rename(columns = {'Name':'city'})
pd_geo = pd.merge(pd_scrape, pd_location_1, on = "city")
pd_geo['city'] = pd_geo['city'] + ', CA'
conn = sql.connect("accommodation.sqlite")
pd_geo.to_sql("geo_info", conn, if_exists = 'replace', index = False)
def search(keyterm, city, criteria):
"""
create a dataframe with the yelp search result
Argument: keyterm(restaurants/hotels/nightlife/landmarks), city, criteria
Return: a dataframe
"""
result = yelp_search(keyterm, city, criteria)
df_result = pd.DataFrame(result)
df_result["city"] = city
return df_result
def create_df(keyterm, criteria):
"""
create a dataframe with the yelp search result for all the cities in California
Argument: keyterm(restaurants/hotels/nightlife/landmarks), criteria
Return: a dataframe
"""
yelp_result = [search(keyterm, "%s, CA" %city, criteria) for city in pd_scrape["city"]]
df_result = pd.concat(yelp_result, ignore_index = True)
df_result_final = pd.merge(df_result, pd_geo, on = "city", how = "inner")
df_result_final = df_result_final.drop(["deals", "gift_certificates", "is_claimed", "menu_date_updated", "menu_provider", "mobile_url", "phone", "rating_img_url", "rating_img_url_large","rating_img_url_small", "snippet_image_url", "location"], axis=1)
df_result_final = df_result_final.dropna()
# Must remove the square brackets twice
df_result_final["categories"] = df_result_final["categories"].str.get(0)
df_result_final["categories"] = df_result_final["categories"].str.get(0)
return df_result_final
def create_table(keyterm, criteria):
"""
create tables in the database
Argument: keyterm(restaurants/hotels/nightlife/landmarks), criteria
Return: tables in the database (No output, but tables are created)
"""
df_result_final = create_df(keyterm, criteria)
conn = sql.connect("accommodation.sqlite")
df_table = df_result_final.to_sql("Best_" + keyterm, conn, if_exists = 'replace', index = False)
return df_table
conn.close()
# create tables for restaurants, hotels and landmarks in accommodation.sqlite
travel_item = ["Restaurants", "Hotels", "Landmarks"]
for item in travel_item:
create_table(item, "best")