Goal: create different plots for analysis for the categories: Restaurant, hotel, and landmarks
import sqlite3 as sql
import pandas as pd
import numpy as np
import plotly
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import plotly.graph_objs as go
import colorlover as cl
from collections import Counter
tls.set_credentials_file(
username = 'stephanielam',
api_key = 'MYHCKKgTiB3tGETcNcLB')
mapbox_access_token = 'pk.eyJ1Ijoic3RhMTQxcHJvamVjdCIsImEiOiJjaXpxYWUyOHEwMTZ1MzNuNHp6b21zY2Y5In0.Vr6h57ZVuRRgfJ4RwW9wGw'
def get_df(keyterm):
conn = sql.connect("accommodation.sqlite")
df_graph = pd.read_sql("SELECT * FROM Best_" + keyterm, conn)
return df_graph
df_food_final = get_df("Restaurants")
df_food_final.head()
def freq_cat(df):
"""
get the Top 10 most frequent category labels (Overall)
argument: dataframe
return: dataframe of top 10 category
"""
count = []
for i in range(0,len(df)):
data = df['categories'][i]
count.append(data)
freq = Counter(count)
count_df = pd.DataFrame(freq.most_common()[0:10])
return count_df
def freq_bar_plot(df, keyterm, colors):
"""
Argument: dataframe, keyterm, colors
Return: bar plot of the top 10 most frequent category
"""
data = [go.Bar(x= count_df[0],y= count_df[1],
marker=dict(color = colors))]
layout = go.Layout(title='Top 10 Most Frequent Category of %s' %keyterm)
fig = go.Figure(data=data, layout=layout)
return py.iplot(fig, filename = 'Top 10 most frequent category of %s' %keyterm)
#get the most frequent category labels
count_df = freq_cat(df_food_final)
#most frequent category for Restaurant
colors = cl.scales['10']['div']['RdYlBu']
freq_bar_plot(count_df, "Restaurant", colors)
From the bar plot, we can see that Mexican food is the most popular.
The functions below are for having a better understanding of how the categories are distributed in differnt rating.
def get_count(category, df):
"""count the rating
return: the labels and values"""
count_list =[]
for i in range(len(df)):
if df['categories'][i] == category:
count_list.append(df['rating'][i])
labels, values = zip(*Counter(count_list).items())
count_list =[]
return labels, values
def get_cat_count(dataframe, df):
"""loop through the categories and get all the rating count by category"""
cat_count = []
for i in range(0, len(dataframe[0])):
count = get_count(dataframe[0][i], df)
cat_count.append(count)
return cat_count
def star_dist_plot(df, keyterm):
"""stack plot of the star distribution(by category)"""
count_df = freq_cat(df)
cat_count = get_cat_count(count_df, df)
trace = dict()
for i in range(0, len(cat_count)):
trace[i] = go.Bar(x = cat_count[i][0], y=cat_count[i][1], name= count_df[0][i])
data = [trace[i] for i in trace]
layout = go.Layout(barmode = 'stack', title ='Rating stars distribution (by categeory) for %s' %keyterm
)
fig = go.Figure(data=data, layout=layout)
return py.iplot(fig, filename='Rating stars distribution (by categeory) for %s' %keyterm)
#stack plot
star_dist_plot(df_food_final, "Restaurant")
We can see that most of the restaurants have 4 stars rating or higher, so we can tell that, CA overall has pretty posh restaurants. Note that out of all the thai restaurants, there is 0 of them got a 5 star rating.
def average_city(dataframe, category):
"""get the average star rating (by City)"""
average_rate = dataframe.groupby('city')[category].mean()
average_rate = average_rate.reset_index()
average_rate = pd.DataFrame(average_rate)
return average_rate
def city_rating(df):
"""return dataframe of both average rating and review count"""
rating_star = average_city(df, 'rating')
average_review = average_city(df, 'review_count')
rating_star = pd.merge(rating_star, average_review, on= 'city')
#subset the city, latitude, and longitude from the df and create new df
pd_geo = df[['city','Latitude ','Longitude']]
pd_geo = pd_geo.drop_duplicates()
pd_geo = pd_geo.reset_index()
#combine the dataframes together by city name
rating_star = pd.merge(rating_star, pd_geo, on='city')
return rating_star
def city_rating_map(df, keyterm, scale):
"""
function created to plot the map of average rating by city
"""
df['text'] = 'City: ' + df['city'] + ' <br>Rating: ' + df['rating'].astype(np.str) +' star' + '<br>Average Reviews count: ' + df['review_count'].astype(np.str)
limits = [(1,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5)]
colors = ["lightgrey","lightgreen","lightblue","lightyellow","red"]
cities = []
for i in range(len(limits)):
lim = limits[i]
rate_sub = df.query('%f' %lim[0] +'<= rating < %f' %lim[1])
city = dict(
type = 'scattergeo',
locationmode = 'USA-states',
lon = rate_sub['Longitude'],
lat = rate_sub['Latitude '],
text = rate_sub['text'],
marker = dict(
size = rate_sub['review_count'] / scale,
color = colors[i],
line = dict(width=0.5, color='rgb(40,40,40)'),
sizemode = 'area'
),
name = '{0} - {1} stars'.format(lim[0],lim[1]))
cities.append(city)
layout = dict(
title = '%s Average Rating by City<br>(Click legend to toggle traces)'%keyterm,
showlegend = True,
geo = dict(
scope='usa',
projection=dict( type='albers usa' ),
showland = True,
landcolor = 'rgb(217, 217, 217)',
subunitwidth=1,
countrywidth=1,
subunitcolor="rgb(255, 255, 255)",
countrycolor="rgb(255, 255, 255)",
lonaxis = dict( range= [ -124.5, -114.0 ] ),
lataxis = dict( range= [ 32.0, 42.0 ] )
),
)
fig = dict( data=cities, layout=layout )
return py.iplot(fig, validate=False, filename='%s average rating by city'%keyterm)
rating_star = city_rating(df_food_final)
city_rating_map(rating_star,'Restaurant', 2.5)
In the map, the bigger dots indicate higher average review count. we see that most of the restaurants in different cities are between 4 and 4.5 stars. And there are currently no city that has a average rating higher than 4.5.
def limit_data(df):
"""restrict to top 10 category """
restricted_data = pd.DataFrame()
for i in range(0, len(df)):
for j in range(0, len(count_df)):
# restrict to top 10 category
if df['categories'][i] == count_df[0][j]:
data = df.iloc[i]
restricted_data = restricted_data.append(data ,ignore_index=True)
return restricted_data
restricted_food = limit_data(df_food_final)
We want the users to be able to look at where are the top 3 restaurants located by the category they want. We created a function to extract the top 3 restaurants by categories and then plot the locations on a map.
def top3by_cat(category, df, keyterm, number):
"""top 3 restaurant by category"""
temp = pd.DataFrame()
for i in range(0, len(df)):
if keyterm == 'restaurant':
cat = df['categories'][i]
elif keyterm == 'hotels':
cat = df['type'][i]
elif keyterm == 'landmarks':
continue
if cat == category:
data = df.iloc[i]
temp = temp.append(data, ignore_index=True)
best_3 = temp.groupby('rating')['review_count'].nlargest(10).reset_index()
best_3 = pd.DataFrame(best_3)
df_best3 = pd.DataFrame()
for i in range(0,len(best_3)):
if best_3['rating'][i] == 5.0:
top = temp.iloc[[best_3['level_1'][i]]]
df_best3 = pd.concat([top,df_best3],ignore_index = True)
df_best3 = df_best3.drop_duplicates(['name'])
if len(df_best3) == number:
break
if len(df_best3) < number:
for i in range(0,len(best_3)):
if best_3['rating'][i] == 4.5:
top = temp.iloc[[best_3['level_1'][i]]]
df_best3 = pd.concat([top,df_best3],ignore_index = True)
df_best3 = df_best3.drop_duplicates(['name'])
if len(df_best3) == number:
break
return df_best3
def top3_map(categories, df, keyterm,number):
"""plot the map of the top 3 restaurant for each category"""
trace = dict()
color = cl.scales['10']['div']['Spectral']
for i in range(0, len(categories)):
df_best3 = top3by_cat(categories[i], df, keyterm, number)
df_best3 = df_best3.iloc[::-1]
df_best3['text'] = 'Name: ' + df_best3['name'] + '<br>City: ' + df_best3['city'] +'<br>Rating: ' + df_best3['rating'].astype(np.str)+ ' star' + '<br>Reviews count: ' + df_best3['review_count'].astype(np.str)
trace[i] = Scattermapbox(
lat=df_best3['Latitude '],
lon=df_best3['Longitude'],
mode='markers',
marker = Marker(
size=15,
color= color[i],
opacity=0.7),
text=df_best3['text'],
name = '%s' %categories[i]
)
data = Data([trace[i] for i in trace])
layout = Layout(
title = 'Top 3 %s in California by category'%keyterm,
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(
lat=36.5,
lon=-120
),
pitch=0,
zoom=4.8
),
)
fig = dict(data=data, layout=layout)
return py.iplot(fig, filename='Top 5 %s by category'%keyterm, validate=False)
top3_map(count_df[0], restricted_food, 'restaurant', 3)
Most of the top 3 restaurants are located near San Francisco and the Los Angeles to San Diego area.
from PIL import Image
import urllib, cStringIO
from IPython.display import display
def display_img(category, df, keyterm, number):
category = unicode(category)
"""image of the top 3 restaurants by category """
image_url = []
for i in range(0, 3):
dfbest3 = top3by_cat(category, df, keyterm,number)
dfbest3 = dfbest3.iloc[::-1]
url = dfbest3['image_url'][i]
image_url.append(str(url))
print 'Image of the top 3 %s restaurants:' %category
for image in image_url :
link = cStringIO.StringIO(urllib.urlopen(image).read())
img = Image.open(link)
display(img)
return
#For example:
display_img('Mexican', restricted_food, 'restaurant',3)
df_hotels_final = get_df("Hotels")
df_hotels_final.head()
#get the most frequent category labels
count_df = freq_cat(df_hotels_final)
colors = ['rgba(222,45,38,0.8)', 'grey', 'grey', 'grey', 'grey', 'grey', 'grey', 'grey','grey', 'grey']
freq_bar_plot(count_df, 'Hotels', colors)
From the dataframe we got from yelp, the Hotel category doesn't contain much varieties of category. Therefore, to better analysis the hotel data, we will first restrict the data to those that are indeed Hotels, and look at the pie chart of the star distribution and map of average rating in each city
def restrict_df(df, category):
"""limit the dataframe to only the category we want"""
df_limit = pd.DataFrame()
for i in range(0, len(df)):
if df['categories'][i] == category:
df_limit = pd.concat([df.iloc[[i]],df_limit], ignore_index = True)
return df_limit
category = u'Hotels'
df_limit_hotels = restrict_df(df_hotels_final, category)
def star_pie(df, keyterm):
"""star distribution in the category 'Hotels'"""
labels, values = zip(*Counter(df).items())
fig = {'data': [{'labels': labels,'values': values,'type': 'pie'}],
'layout': {'title': "Pie chart of star distribution in the category '%s'" %keyterm}}
return py.iplot(fig, filename = "star distribution in the category %s" %keyterm)
star_pie(df_limit_hotels['rating'], 'Hotels')
From the pie chart, we can see that the majority of hotels are between 3 to 4 stars. There is only 3.04% of hotels are rated 5 stars while 26.7% are 3.5 stars.
hotel_rating = city_rating(df_limit_hotels)
city_rating_map(hotel_rating,'Hotels', 1)
Compared to the restaurants category, there are actually quite a lot of cities that has a average hotel rating between 1 - 3 stars. The only 2 red dots in the map: the dot in Exeter is very small and the average reviews count is only 14, indicating that even though it has a average 4.5 rating in the city, there might be just one hotel in the city. On the other hand, the hotels in Palm Spring is very nice because the average review count is very high.
To find the top hotels in California, we wanted the user to be able to look at the top of different kinds of hotel such as hotel, motel, inn, and etc. Therefore, we group the hotels into different categories by finding the pattern within their name.
def find_pattern(kind, df):
"""find the pattern in names"""
frame = pd.DataFrame()
for i in range(0, len(df)):
if kind in df['name'][i].lower():
temp = df.iloc[i]
frame = frame.append(temp)
frame = pd.DataFrame(frame)
return frame
hotel_group = pd.DataFrame()
kinds = [u'hotel', u'motel', u'inn', u'suites', u'lodge', u'resort']
for i in range(0, len(kinds)):
group = find_pattern(kinds[i], df_limit_hotels)
group['type'] = kinds[i]
#create new dataframe
hotel_group = pd.concat([hotel_group, group], ignore_index = True)
top3_map(kinds, hotel_group, 'hotels', 3)
Same as the restaurant category, most top hotels are located near San Francisco and LA-SD area
df_landmarks_final = get_df("Landmarks")
#get the most frequent category labels
count_df = freq_cat(df_landmarks_final)
#plot of the most frequent category for Landmarks
colors = ['rgba(222,45,38,0.8)', 'grey', 'grey', 'grey', 'grey', 'grey', 'grey', 'grey','grey', 'grey']
freq_bar_plot(count_df, 'Landmarks', colors)
category = u'Landmarks & Historical Buildings'
df_limit_landmarks = restrict_df(df_landmarks_final, category)
star_pie(df_limit_landmarks['rating'], 'Landmarks')
From the pie chart, most landmarks seem to have high rating, ranged from 4 to 5 stars.
landmark_rating = city_rating(df_limit_landmarks)
city_rating_map(landmark_rating,'Landmarks', 1)
From the previous plot and this map, we see that landmarks have dramatically more cities that are highly-rated. And most of them also have high average review count.
df_limit_landmarks['name'][0:10]
Since we focused on only the landmarks and historical building category and there is no pattern in their names, we will find and plot the top 10 landmarks in California instead.
def find_best_10(data, df):
"""get the best 10"""
df_best_10 = pd.DataFrame()
for i in range(0,len(data)):
if data['rating'][i] == 5.0:
top = df.iloc[[data['level_1'][i]]]
df_best_10 = pd.concat([top,df_best_10],ignore_index = True)
df_best_10 = df_best_10.drop_duplicates(['name'])
if len(df_best_10) == 10:
break
if len(df_best_10) < 10:
for i in range(0, len(data)):
if data['rating'][i] == 4.5:
top = df.iloc[[data['level_1'][i]]]
df_best_10 = pd.concat([top,df_best_10],ignore_index = True)
df_best_10 = df_best_10.drop_duplicates(['name'])
if len(df_best_10) == 10:
break
return df_best_10
def best10(df):
"""dataframe of the top 10 landmarks"""
best_10 = df.groupby('rating')['review_count'].nlargest(20).reset_index()
best_10 = pd.DataFrame(best_10)
df_best_10 = find_best_10(best_10, df)
df_best_10 = df_best_10.iloc[::-1]
return df_best_10
df_best_10 = best10(df_limit_landmarks)
df_best_10.head()
df_landmark_best_10 = df_best_10.reset_index()
def best_10_map(dataframe, keyterm):
"""map of the top 10 landmarks"""
dataframe['text'] = 'Name: ' + dataframe['name'] + '<br>City:' + dataframe['city'] +'<br>Rating: ' + dataframe['rating'].astype(np.str)+ ' star' + '<br>Reviews count: ' + dataframe['review_count'].astype(np.str)
color = cl.scales['10']['div']['Spectral']
data = Data([
Scattermapbox(
lat=dataframe['Latitude '],
lon=dataframe['Longitude'],
mode='markers',
marker=Marker(
size=17,
color=[color[i] for i in range(0, len(color))],
opacity=0.7
),
text=dataframe['text'],
)
])
layout = Layout(
title = 'Top 10 %s in California'%keyterm,
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_access_token,
bearing=0,
center=dict(
lat=37,
lon=-122
),
pitch=0,
zoom=4.2
),
)
fig = dict(data=data, layout=layout)
return py.iplot(fig, filename='Top 10 %s'%keyterm, validate=False)
best_10_map(df_landmark_best_10, 'Landmarks')
Different than the restaurant and hotel category, out of the 10 top landmarks, half of them are located in San Francisco while Los Angeles does not have any.