The following code is published for reproducibility purposes. It helps to collect and prepare data for Data Science Specialization Capstone Project.
Data for the analysis are provided by COursera in zip format at the following link.
In order to reproduce the analysis we require to download the zip file into data subfolder. The zip file contains 5 datasets in json format regarding:
- businesses
- reviews from 2004 to 2015
- users
- tips
- check-ins
The following packages are required:
- knitr
- plyr
- dplyr
- ggplot2
- rjson
- jsonlite
- lubridate
- xtable
- R.utils
- ggmap
In this section we will create the dataset for the analysis.
library ( rjson )
library ( jsonlite )
library ( knitr )
library ( plyr )
library ( dplyr )
library ( ggplot2 )
library ( lubridate )
library ( xtable )
library ( R.utils )
library( ggmap )
For our purposes we focus on business and reviews data, in particular on restaurants data.
Given the size of the file this operation can take a few minutes.
unzip(zipfile="data/yelp_dataset_challenge_academic_dataset.zip",overwrite = TRUE,junkpaths = TRUE,exdir="data")
json_data_business <- stream_in(file("data/yelp_academic_dataset_business.json"))
json_data_review <- stream_in(file("data/yelp_academic_dataset_review.json"))
After loading the data we can examine the content of the dataset.
Business data are treated as follows:
- We select columns from business data that will be used in our analysis
- attribution of main category
- attribution of subcategory for restaurants
- attribution of city
idx_selection <- which(colnames(json_data_business)%in%c("business_id","open","categories","city","longitude","latitude","reviews","stars"))
business_df <- json_data_business[idx_selection]
business_df <- tbl_df(business_df)
We focus on restaurants
rm(json_data_business)
gc()
idx_restaurants <- grep(pattern="Restaurants", x=business_df$categories , value=FALSE)
business_df <- business_df[idx_restaurants,]
Categories are a list in the business dataframe.
There are at most 10 categories per entry.
We separate it and convert it in a dataframe with 10 columns.
Then we substitute the original list with the dataframe.
business_categories <- business_df$categories
business_categories_df <- data.frame(do.call(rbind, business_categories))
colnames( business_categories_df ) <- paste0("subcategory_",1:10)
#Categories are recycled rowwise and have different levels
#we apply same levels to category factor
possible_categories <- unique(unlist(lapply(business_categories_df , levels)))
for( i in 1:ncol(business_categories_df) ){
business_categories_df[,i] <- factor( business_categories_df[,i] , levels= possible_categories )
}
# We trim the redundant categories per row. It may take a while.
categories_per_business <- unlist(lapply(business_categories, length ))
for( i in 1:length(categories_per_business) ){
if( categories_per_business[i]<10 ) business_categories_df[i,(categories_per_business[i]+1):10] <- NA else next
}
# substitute orginal variable categories with the dataframe
business_df$categories <- NULL
business_df <- cbind( business_df , business_categories_df )
Data regard 10 cities in US, Canada and Europe. The variable city in business dataframe is not useful to identify the macroarea. We cluster the restaurants using the coordinates.
#library(ggmap)
cities<-c('Edinburgh, UK', 'Karlsruhe, Germany', 'Montreal, Canada', 'Waterloo, Canada',
'Pittsburgh, PA', 'Charlotte, NC', 'Urbana-Champaign, IL', 'Phoenix, AZ', 'Las Vegas, NV', 'Madison, WI')
city.centres<-geocode(cities ,source = "google")
city.centres$city <- cities
set.seed(345)
geo.cluster<-kmeans(x=business_df[,c('longitude','latitude')],
centers= city.centres[,1:2])
business_df$city_name <- city.centres$city[geo.cluster$cluster]
We load reviews dataset and select columns of interest
idx_rev_selection <- which(colnames(json_data_review)%in%c("business_id","review_id","date"))
review_df <- json_data_review[idx_rev_selection]
review_df <- tbl_df(review_df)
# free memory
rm(json_data_review)
gc()
Now we add variables useful for analysis.
# add open and close dates
review_df$date <- as.POSIXct( review_df$date , format= "%Y-%m-%d",tz="GMT")
# filter reviews of restaurants
review_df <- semi_join(review_df, business_df , by="business_id")
# the start date is the first review of each business
business_start_date <- review_df %>%
select( business_id , date )%>%
group_by( business_id ) %>%
summarize(start_date = min( date ) )%>%
mutate( year_start_date= year(start_date) )
# There are 21799 business with start date
business_df <- inner_join(business_df, business_start_date , by="business_id")
# closing date is the max review date
review_df <- review_df %>%
mutate( year = year(date) )
max_date <- review_df %>%
select( business_id , date )%>%
group_by( business_id ) %>%
summarize( close_date= max(date) )%>%
mutate( year_close = year(close_date) )
business_df <- left_join( business_df, max_date , by="business_id")
# open business get close date 2016/12/31
business_df[business_df$open==TRUE,"close_date"] <- as.POSIXct("2016-12-31", format="%Y-%m-%d",tz="GMT")
business_df[business_df$open==TRUE,c("year_close")] <- c(2016)
For each business we select the main subcategory. We choose the subcategory giving priority to the more frequent.
subcategories <- business_df[,grep(pattern = "subcat", x=colnames(business_df))]
# remove Restaurants and Food
subcategories <- lapply( subcategories , function(x) gsub( pattern="Restaurants" , x= x , replacement = NA ) )
subcategories <- lapply( subcategories , function(x) gsub( pattern="^Food" , x= x , replacement = NA ) )
# create complete levels and sort levels per frequency
subcategories_levels <- levels(subcategories$subcategory_1)
levels_sorted<- names(sort( table( unlist( subcategories )) , decreasing= TRUE) )
subcategories <- lapply(subcategories, function(x) factor(x,levels=levels_sorted) )
subcategories_code <- as.data.frame( lapply( subcategories, function(x) as.numeric(x) ))
subcategories_code [is.na(subcategories_code)] <- 999
# min category is the one with higher frequency
first_subcategory <- apply( subcategories_code , 1, min )
first_subcategory <- levels_sorted[first_subcategory]
business_df$first_subcategory <-factor(first_subcategory)
# There are 349 restaurants without subcategory. We filter them out.
business_df <- business_df %>%
filter( !is.na(first_subcategory) )
# add year start date per city category
business_df <- business_df %>%
group_by(city_name , first_subcategory )%>%
mutate( category_start= min(year_start_date) )
We then add useful variables to reviews dataset.
# add subcategory and city name to reviews dataset
review_df <- inner_join(x=review_df,y=business_df[,c("business_id","first_subcategory","city_name")],by="business_id")
# add subcategory start_year
review_df <- inner_join(x=review_df,y=business_df[,c("business_id","city_name","category_start")],by=c("city_name","business_id") )
review_df <- review_df %>%
mutate(year_since_start=year-category_start)
Now we prepare the dataset that will be used in our analysis.
In order to compute share of reviews and share of businesses per type of cuisine we compute total reviews and total businesses per city/year.
# compute reviews per city category year
# It will be the main inout variable
reviews_per_city_year <- review_df %>%
group_by( city_name , first_subcategory , year ) %>%
summarize( reviews = n() )
# compute new businesses per city category year
business_open_per_city_year <- business_df %>%
group_by( city_name , first_subcategory , year_start_date ) %>%
summarize( business_open = n() )
colnames(business_open_per_city_year)[3] <- "year"
business_close_per_city_year <- business_df %>%
group_by( city_name , first_subcategory , year_close ) %>%
summarize( business_close = n() )
colnames(business_close_per_city_year)[3] <- "year"
We simplify the dataset for our analysis building a dataset without the business_id variable.
# every possible combination of city, category , year
complete_df <-
expand.grid( city_name= unique(business_df$city_name) ,
first_subcategory= unique( droplevels(business_df$first_subcategory) ) ,
year=unique(business_df$year_start_date) )
complete_df <- tbl_df(complete_df) # 13920x3
# remove NA subcategory and add total business open close and reviews
complete_df <- complete_df %>%
filter(!is.na(first_subcategory)) %>%
arrange(city_name,first_subcategory,year)
complete_df <- left_join(complete_df,business_open_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
complete_df <- left_join(complete_df,business_close_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
complete_df <- left_join(complete_df,reviews_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
complete_df [is.na(complete_df)] <- 0
# compute existing businesses per city category at the eond of each year
complete_df <- complete_df %>%
group_by( city_name , first_subcategory ) %>%
mutate ( open_cum_business= cumsum(business_open) ,
close_cum_business= cumsum(business_close) ,
existing_business= open_cum_business-close_cum_business )
# remove cumulated count to keep the dataset simpler
complete_df$open_cum_business <- NULL
complete_df$close_cum_business <- NULL
# replace existing business 0 with reviews in the same year with business close
# because reviews concerned businesses actually closed
complete_df$existing_business[complete_df$existing_business==0 & complete_df$reviews>0] <- complete_df$business_close[complete_df$existing_business==0 & complete_df$reviews>0]
# compute share of reviews with respect to share of business
reviews_per_city_year <- review_df %>%
group_by( city_name , year ) %>%
summarise( tot_reviews_per_city = n() )
complete_df <- left_join(complete_df,reviews_per_city_year,by=c("city_name" , "year") )
business_per_city_year <- complete_df %>%
group_by( city_name , year ) %>%
summarise( businesses_per_city= sum(existing_business))
complete_df <- left_join(complete_df,business_per_city_year,by=c("city_name" , "year") )
complete_df$review_share <- complete_df$reviews/complete_df$tot_reviews_per_city
complete_df$business_share <- complete_df$existing_business/complete_df$businesses_per_city
complete_df$tot_reviews_per_city <- NULL
complete_df$businesses_per_city <- NULL
We clean the complete_df dataset
# 2015 has too few observation
# incomplete year
complete_df <- complete_df %>%
filter( year < 2015 )
# 12760 x 9
# remove not existing year per city
city_first_year <- complete_df %>%
filter ( reviews >=1 ) %>%
group_by( city_name ) %>%
summarise( first_year= min(year) )
complete_df <- inner_join( complete_df , city_first_year , by= "city_name" )
complete_df <- complete_df %>%
filter(year>=first_year) # 11020x10
We compute the number of existing businesses at the midpoint between the current and the previous year. This variable will be used to compute the ratio between revews share and business share
complete_df_previous <- complete_df %>%
select ( city_name , first_subcategory , year , existing_business )
complete_df_previous$year <- complete_df_previous$year+1
colnames(complete_df_previous)[-c(1:3)] <- paste(colnames(complete_df_previous)[-c(1:3)],"t1",sep="_" )
complete_df <- left_join(complete_df,complete_df_previous,by=c("city_name" , "first_subcategory" , "year") )
# we replace NAs of t1 values with end-of-period value
complete_df_first_values <- complete_df %>%
group_by ( city_name , first_subcategory ) %>%
mutate( first_year = min(year) ) %>%
filter ( year == first_year) %>%
select ( city_name , first_subcategory , year , existing_business )
# column integration with original and adjusted columns
x_values <- semi_join( complete_df, complete_df_first_values , by=c("city_name" , "first_subcategory" , "year") )
x_values$existing_business_t1 <- complete_df_first_values$existing_business
# remove original columns from complete_df
complete_df <- anti_join(complete_df,x_values,by=c("city_name" , "first_subcategory" , "year") )
complete_df <- rbind( complete_df , x_values ) # replace with adjusted subset
complete_df <- complete_df %>%
arrange(city_name,first_subcategory,year)
# review_business_share_ratio is computed with respect to businesses of previous year
# in order to compute the ratio reviews/existing restaurants
# I take the average value between existing_business_prev and existing_business
complete_df$business_midyear <- complete_df$existing_business_t1
complete_df$business_midyear[is.na(complete_df$business_midyear)] <- complete_df$existing_business[is.na(complete_df$business_midyear)]
complete_df$business_midyear <- (complete_df$existing_business+complete_df$existing_business_t1)/2
# if the number of businesses at the end of period is 1 then halfway is 1
complete_df$business_midyear[complete_df$business_midyear<1 & complete_df$business_midyear>0 ] <- 1
complete_df$business_midyear[complete_df$existing_business<2 & complete_df$existing_business_t1==0] <- 1
complete_df$business_midyear[complete_df$existing_business==0 & complete_df$existing_business_t1==0] <- 0
# some restaurants have open and close in same year
complete_df$business_midyear[complete_df$existing_business==0 & complete_df$existing_business_t1==0 ] <- complete_df$business_open[complete_df$existing_business==0 & complete_df$existing_business_t1==0 ]
# now I need to compute the business share of midpoints
business_mid_per_city_year <- complete_df %>%
group_by( city_name , year ) %>%
summarise( businesses_mid_per_city= sum(business_midyear) )
complete_df <- left_join(complete_df,business_mid_per_city_year,by=c("city_name" , "year") )
complete_df$business_midyear_share <- complete_df$business_midyear/complete_df$businesses_mid_per_city
complete_df$businesses_mid_per_city <- NULL
complete_df$review_business_share_ratio <- complete_df$review_share/complete_df$business_midyear_share
complete_df <- complete_df %>%
filter(!is.na(review_business_share_ratio)) # 4365
Then we add the variables for our analysis:
- SRvsSB vs historical mean at previous period
- business_share vs historical mean
# compute lagged values of review_business_share_ratio
SRvsSB_previous <- complete_df %>%
select ( city_name , first_subcategory , year, review_business_share_ratio )
SRvsSB_previous$year <- SRvsSB_previous$year+1
colnames(SRvsSB_previous)[-c(1:3)] <- paste(colnames(SRvsSB_previous)[-c(1:3)],"t1",sep="_" )
complete_df <- left_join(complete_df,SRvsSB_previous,by=c("city_name" , "first_subcategory" , "year") )
# historical average lagged values
complete_df_hist_avg <-complete_df %>%
arrange(city_name,first_subcategory,year)%>%
group_by(city_name,first_subcategory)%>%
mutate( business_share_hist = cummean(business_share) ,
SRvsSB_avg_hist = cummean( review_business_share_ratio ) ) %>%
select (city_name,first_subcategory,year, business_share_hist ,SRvsSB_avg_hist )
complete_df_hist_avg$year <- complete_df_hist_avg$year+1
colnames(complete_df_hist_avg)[-c(1:3)] <- paste(colnames(complete_df_hist_avg)[-c(1:3)],"t1",sep="_" )
# we add lagged values of business_share historical average
complete_df <- left_join(complete_df,complete_df_hist_avg[,1:4],by=c("city_name" , "first_subcategory" , "year") )
# historical average of SRvsSB_avg_hist is lagged another time to become comparable with review_business_share_ratio_t1
# this way we analyze change in business_share vs change in SRvsSB at previous period
complete_df_hist_avg$year <- complete_df_hist_avg$year+1
colnames(complete_df_hist_avg)[-c(1:3)] <- paste(colnames(complete_df_hist_avg)[-c(1:3)],"t2",sep="_" )
complete_df <- left_join(complete_df, complete_df_hist_avg[,c(1:3,5)] ,by=c("city_name" , "first_subcategory" , "year") )
colnames(complete_df)[17] <- "SRvsSB_avg_hist_t2"
colnames(complete_df)[14:17] <- c( "SRvsSB" , "SRvsSB_t1" , "business_share_avg_hist_t1" , "SRvsSB_avg_hist_t2" )
complete_df <- complete_df %>%
mutate( SRvsSB_vs_hist= SRvsSB_t1/SRvsSB_avg_hist_t2 ,
business_share_vs_hist = business_share/business_share_avg_hist_t1)
complete_df <- complete_df %>%
filter(!is.na(SRvsSB_vs_hist)) # 3095
Finally We export the datasets for analysis.
saveRDS(review_df , file= "data/review_df.rds")
saveRDS(business_df , file= "data/business_df.rds")
saveRDS(complete_df , file= "data/complete_df.rds")