Part I Overview:

Goal: Extract data from the Yelp API, clean up the data, and create tables for the best (highest-rated and nearby) restaurants, hotels, and landmarks

Steps:

  • Create functions to do natural language processing and create graphs
  • Analyze each table in the accommodation database
In [6]:
import requests_cache
import requests
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from pandas.tools.plotting import scatter_matrix
import numpy as np
import seaborn as sns
from requests_oauthlib import OAuth1Session
import glob
import json
import os
import urllib2
from urllib2 import Request, urlopen
from requests_oauthlib import OAuth1Session
from bs4 import BeautifulSoup
import sqlite3 as sql

Data Wrangling

In [7]:
# Cache all HTTP responses in a local "cache" sqlite file so repeated
# Yelp/Wikipedia requests during development don't re-hit the network.
requests_cache.install_cache("cache")

# criteria: 1=Distance, 2=Highest Rated

def yelp_search(term, location, criteria):
    """
    Make a Yelp v2 search request.

    Arguments:
        term: search term (e.g. "Restaurants", "Hotels")
        location: city string passed to the API (e.g. "Davis, CA")
        criteria: "distance" -> sort=1, "highest rated" -> sort=2,
                  anything else -> sort=0 (best match); case-insensitive

    Return: the list of business dicts from the response
            (for 0 items, return an empty list)
    """
    # SECURITY: credentials should not live in source. Read them from the
    # environment, falling back to the original literals so existing
    # callers/environments keep working unchanged.
    consumer_key = os.environ.get("YELP_CONSUMER_KEY", "uFKYftN0fu4pD6guW2cYng")
    consumer_secret = os.environ.get("YELP_CONSUMER_SECRET", "iUYQ2Hk3-vTQVd0zOFnvloWPivA")
    token = os.environ.get("YELP_TOKEN", "FZnep4FiFmwrrifuNm6M-_K661Wb3J02")
    token_secret = os.environ.get("YELP_TOKEN_SECRET", "CRvBo_fO-r4pE_-p2UrnN_mxBlc")

    urlbase = "http://api.yelp.com/v2/search"

    # Map the human-readable criteria onto Yelp's numeric sort codes;
    # unknown criteria fall back to "0" (best match), as before.
    sort_codes = {"distance": "1", "highest rated": "2"}
    sort = sort_codes.get(criteria.lower(), "0")
    params = {"term": term, "location": location, "sort": sort,
              "cll": "latitude,longitude"}

    session = OAuth1Session(consumer_key, consumer_secret, token, token_secret)
    try:
        # Transforms JSON API response into a dictionary
        data = session.get(urlbase, params=params).json()
    finally:
        # BUG FIX: the session previously leaked if the request raised.
        session.close()

    # BUG FIX: the docstring promises an empty list for 0 items, but the
    # original returned {} when "businesses" was missing.
    return data.get("businesses", [])
In [8]:
def extract(url):
    """
    Extract city names and population figures from a Wikipedia page.

    Argument: url -- page listing California cities/towns in a table

    Return: dict with keys "city" (list of city names) and
            "city_population" (list of population strings, same order)
    """
    page = requests.get(url).content
    wikisoup = BeautifulSoup(page, 'lxml')

    # City names live in row-header cells; the slice [1:-8] drops the
    # page's leading header row and the trailing non-city rows.
    # NOTE(review): slice bounds are tied to this page's current layout.
    city_cells = wikisoup.findAll("th", {"scope": "row"})
    city = [cell.text for cell in city_cells][1:-8]

    # BUG FIX: the original called findAll("td", {"align": "right"}) twice
    # and discarded the first result; a single query is sufficient.
    population_cells = wikisoup.findAll("td", {"align": "right"})
    joined = "\n".join(cell.text for cell in population_cells)
    # Right-aligned cells alternate between population and another numeric
    # column, so take every other entry.
    population = joined.split("\n")[0::2]

    return {"city": city, "city_population": population}
In [9]:
# extract city info from the wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_California"
url_output = extract(url)
# pd_scrape has columns "city" and "city_population"; it is reused by
# create_df() and the geo merge below.
pd_scrape = pd.DataFrame(url_output)
In [12]:
# This Excel file provides latitude and longitude per city.
# The absolute local path is kept as the default for backward compatibility,
# but can now be overridden via the CALCITIES_PATH environment variable.
calcities_path = os.environ.get(
    "CALCITIES_PATH", "/Users/Chloechen/Downloads/calcities.xlsx")
pd_location = pd.read_excel(calcities_path, header = 0)
# Align the join key with pd_scrape's "city" column.
pd_location_1 = pd_location.rename(columns = {'Name':'city'})
pd_geo = pd.merge(pd_scrape, pd_location_1, on = "city")
# Append ", CA" so pd_geo["city"] matches the location strings produced
# by create_df() when merging Yelp results back in.
pd_geo['city'] = pd_geo['city'] + ', CA'
In [14]:
# Open (or create) the SQLite database that will hold the scraped tables.
conn = sql.connect("accommodation.sqlite")
In [16]:
# Persist the merged city/geo table; replace any existing "geo_info" table.
pd_geo.to_sql("geo_info", conn, if_exists = 'replace', index = False)
In [6]:
def search(keyterm, city, criteria):
    """
    Wrap a Yelp search result in a DataFrame tagged with its city.

    Argument: keyterm(restaurants/hotels/nightlife/landmarks), city, criteria

    Return: a dataframe
    """
    businesses = yelp_search(keyterm, city, criteria)
    frame = pd.DataFrame(businesses)
    # Record which city this batch of results came from, so batches can be
    # concatenated and later merged with the geo table on "city".
    frame["city"] = city
    return frame
In [7]:
def create_df(keyterm, criteria):
    """
    create a dataframe with the yelp search result for all the cities in California
    
    Argument: keyterm(restaurants/hotels/nightlife/landmarks), criteria
    
    Return: a dataframe
    """
    # One Yelp search per city in the scraped Wikipedia list; "%s, CA"
    # matches the suffix appended to pd_geo["city"] above.
    yelp_result = [search(keyterm, "%s, CA" %city, criteria) for city in pd_scrape["city"]]
    df_result = pd.concat(yelp_result, ignore_index = True)
    # Inner join keeps only businesses whose city has lat/long data.
    df_result_final = pd.merge(df_result, pd_geo, on = "city", how = "inner")
    # Drop Yelp response columns that are not needed downstream.
    df_result_final = df_result_final.drop(["deals", "gift_certificates", "is_claimed", "menu_date_updated", "menu_provider", "mobile_url", "phone", "rating_img_url", "rating_img_url_large","rating_img_url_small", "snippet_image_url", "location"], axis=1)
    df_result_final = df_result_final.dropna()
    # Must remove the square brackets twice
    # "categories" is nested one level deep (presumably a list of
    # [name, alias] pairs -- TODO confirm against the API response);
    # applying .str.get(0) twice keeps only the first category's name.
    df_result_final["categories"] = df_result_final["categories"].str.get(0)
    df_result_final["categories"] = df_result_final["categories"].str.get(0)
    return df_result_final
In [8]:
def create_table(keyterm, criteria):
    """
    create tables in the database
    
    Argument: keyterm(restaurants/hotels/nightlife/landmarks), criteria
    
    Return: tables in the database (No output, but tables are created)
    """
    df_result_final = create_df(keyterm, criteria)
    conn = sql.connect("accommodation.sqlite")
    try:
        # Table name is e.g. "Best_Restaurants"; replace if it exists.
        df_table = df_result_final.to_sql("Best_" + keyterm, conn, if_exists = 'replace', index = False)
    finally:
        # BUG FIX: conn.close() previously sat after the return statement
        # and was never executed, leaking the connection.
        conn.close()
    return df_table
In [9]:
# Build the "Best_<item>" tables for restaurants, hotels and landmarks
# in accommodation.sqlite.
travel_item = ["Restaurants", "Hotels", "Landmarks"]
for category in travel_item:
    create_table(category, "best")
/Users/Chloechen/anaconda/lib/python2.7/site-packages/pandas/core/generic.py:1165: UserWarning: The spaces in these column names will not be changed. In pandas versions < 0.14, spaces were converted to underscores.
  chunksize=chunksize, dtype=dtype)