DSS Capstone - Data Preparation

The following code is published for reproducibility purposes. It helps to collect and prepare data for Data Science Specialization Capstone Project.

Data for the analysis are provided by COursera in zip format at the following link.
In order to reproduce the analysis we require to download the zip file into data subfolder. The zip file contains 5 datasets in json format regarding:
- businesses
- reviews from 2004 to 2015
- users
- tips
- check-ins

The following packages are required:
- knitr
- plyr
- dplyr
- ggplot2
- rjson
- jsonlite
- lubridate
- xtable
- R.utils
- ggmap

In this section we will create the dataset for the analysis.

  library ( rjson )
  library ( jsonlite )
  library ( knitr )
  library ( plyr )  
  library ( dplyr )
  library ( ggplot2 )
  library ( lubridate )
  library ( xtable )
  library ( R.utils )
  library( ggmap )

For our purposes we focus on business and reviews data, in particular on restaurants data.
Given the size of the file this operation can take a few minutes.

  unzip(zipfile="data/yelp_dataset_challenge_academic_dataset.zip",overwrite = TRUE,junkpaths = TRUE,exdir="data")
  json_data_business <- stream_in(file("data/yelp_academic_dataset_business.json"))
  json_data_review <- stream_in(file("data/yelp_academic_dataset_review.json"))

After loading the data we can examine the content of the dataset.
Business data are treated as follows:
- We select columns from business data that will be used in our analysis
- attribution of main category
- attribution of subcategory for restaurants
- attribution of city

    idx_selection <- which(colnames(json_data_business)%in%c("business_id","open","categories","city","longitude","latitude","reviews","stars"))
    business_df <- json_data_business[idx_selection]
    business_df <- tbl_df(business_df)

We focus on restaurants

    rm(json_data_business)
    gc()
    idx_restaurants <- grep(pattern="Restaurants", x=business_df$categories , value=FALSE)
    business_df <- business_df[idx_restaurants,]

Categories are a list in the business dataframe.
There are at most 10 categories per entry.
We separate it and convert it in a dataframe with 10 columns.
Then we substitute the original list with the dataframe.

   business_categories <- business_df$categories
   business_categories_df <- data.frame(do.call(rbind, business_categories))
   colnames( business_categories_df ) <- paste0("subcategory_",1:10)

   #Categories are recycled rowwise and have different levels
   #we apply same levels to category factor
    possible_categories <- unique(unlist(lapply(business_categories_df , levels)))
    for( i in 1:ncol(business_categories_df) ){
      business_categories_df[,i] <- factor( business_categories_df[,i] , levels= possible_categories ) 
    }
    
    # We trim the redundant categories per row. It may take a while.
   categories_per_business <- unlist(lapply(business_categories, length ))
    for( i in 1:length(categories_per_business) ){
     if( categories_per_business[i]<10 )  business_categories_df[i,(categories_per_business[i]+1):10] <- NA else next
    }
   #  substitute orginal variable categories with the dataframe
   business_df$categories <- NULL
   business_df <- cbind( business_df , business_categories_df )

Data regard 10 cities in US, Canada and Europe. The variable city in business dataframe is not useful to identify the macroarea. We cluster the restaurants using the coordinates.

  #library(ggmap)
    cities<-c('Edinburgh, UK', 'Karlsruhe, Germany', 'Montreal, Canada', 'Waterloo, Canada', 
              'Pittsburgh, PA', 'Charlotte, NC', 'Urbana-Champaign, IL', 'Phoenix, AZ', 'Las Vegas, NV', 'Madison, WI')
    city.centres<-geocode(cities ,source = "google")
    city.centres$city <- cities
    
    set.seed(345)
    geo.cluster<-kmeans(x=business_df[,c('longitude','latitude')],
                        centers= city.centres[,1:2])
   
    business_df$city_name <- city.centres$city[geo.cluster$cluster]

We load reviews dataset and select columns of interest

    idx_rev_selection <- which(colnames(json_data_review)%in%c("business_id","review_id","date"))
    review_df <- json_data_review[idx_rev_selection]
    review_df <- tbl_df(review_df)
    # free memory
    rm(json_data_review)
    gc()

Now we add variables useful for analysis.

# add open and close dates
    review_df$date <-   as.POSIXct( review_df$date , format= "%Y-%m-%d",tz="GMT") 
      
# filter reviews of restaurants    
    review_df <- semi_join(review_df, business_df , by="business_id")
      
# the start date is the first review of each business
      business_start_date <- review_df %>%
                                select( business_id , date )%>%
                                group_by( business_id ) %>%
                                summarize(start_date = min( date ) )%>%
                                mutate(  year_start_date= year(start_date) )

# There are 21799 business with start date
      business_df <- inner_join(business_df, business_start_date , by="business_id")

# closing date is the max review date
      review_df <-    review_df %>%
                         mutate( year = year(date) ) 

            
      max_date <- review_df %>%
                          select( business_id , date )%>%
                          group_by( business_id ) %>%
                          summarize( close_date= max(date) )%>%
                          mutate( year_close = year(close_date) ) 
      
      business_df <- left_join( business_df, max_date , by="business_id")
      
# open business get close date 2016/12/31
      business_df[business_df$open==TRUE,"close_date"] <- as.POSIXct("2016-12-31", format="%Y-%m-%d",tz="GMT") 
      business_df[business_df$open==TRUE,c("year_close")] <- c(2016)

For each business we select the main subcategory. We choose the subcategory giving priority to the more frequent.

       subcategories <- business_df[,grep(pattern = "subcat", x=colnames(business_df))]
      
      # remove Restaurants and Food
      subcategories <- lapply( subcategories , function(x) gsub( pattern="Restaurants" , x=  x , replacement = NA ) )
      subcategories <- lapply( subcategories , function(x) gsub( pattern="^Food" , x=  x , replacement = NA ) )
      
      # create complete levels and sort levels per frequency
      subcategories_levels <- levels(subcategories$subcategory_1)
      levels_sorted<- names(sort( table( unlist( subcategories )) , decreasing= TRUE) )
      subcategories <- lapply(subcategories,  function(x) factor(x,levels=levels_sorted) )  
      subcategories_code <- as.data.frame( lapply( subcategories,  function(x) as.numeric(x) ))   
      subcategories_code [is.na(subcategories_code)] <- 999
      
        # min category is the one with higher frequency
      first_subcategory <- apply( subcategories_code , 1, min )
      first_subcategory <- levels_sorted[first_subcategory]
      business_df$first_subcategory <-factor(first_subcategory)
      
      # There are 349 restaurants without subcategory. We filter them out.
      business_df <- business_df %>%
                        filter( !is.na(first_subcategory) )
      
# add year start date per city category      
      business_df <- business_df %>%
                        group_by(city_name , first_subcategory )%>%
                        mutate( category_start= min(year_start_date) )

We then add useful variables to reviews dataset.

    # add  subcategory and city name to reviews dataset     
      review_df <- inner_join(x=review_df,y=business_df[,c("business_id","first_subcategory","city_name")],by="business_id")
    # add subcategory start_year     
      review_df <- inner_join(x=review_df,y=business_df[,c("business_id","city_name","category_start")],by=c("city_name","business_id") )
      
      review_df <- review_df %>%
                      mutate(year_since_start=year-category_start)

Now we prepare the dataset that will be used in our analysis.
In order to compute share of reviews and share of businesses per type of cuisine we compute total reviews and total businesses per city/year.

     # compute reviews per city category year
      # It will be the main inout variable
      reviews_per_city_year <- review_df %>%
                                      group_by( city_name , first_subcategory , year ) %>%
                                      summarize( reviews = n() )
      
      # compute new businesses per city category year
      business_open_per_city_year <- business_df %>%
                                      group_by( city_name , first_subcategory , year_start_date  ) %>%
                                      summarize( business_open = n() )
      colnames(business_open_per_city_year)[3] <- "year"
      
      
      business_close_per_city_year <- business_df %>%
                                      group_by( city_name , first_subcategory , year_close  ) %>%
                                      summarize( business_close = n() )
      colnames(business_close_per_city_year)[3] <- "year"

We simplify the dataset for our analysis building a dataset without the business_id variable.

# every possible combination of city, category , year
      complete_df <-                    
        expand.grid( city_name= unique(business_df$city_name) , 
                     first_subcategory= unique( droplevels(business_df$first_subcategory) ) , 
                     year=unique(business_df$year_start_date)  )
      
      complete_df <- tbl_df(complete_df)  # 13920x3

            
# remove NA subcategory and add total business open close and reviews
      complete_df <- complete_df %>%
                        filter(!is.na(first_subcategory)) %>%
                        arrange(city_name,first_subcategory,year)
      
      complete_df <- left_join(complete_df,business_open_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
      complete_df <- left_join(complete_df,business_close_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
      complete_df <- left_join(complete_df,reviews_per_city_year,by=c("city_name" , "first_subcategory" , "year") )
      
      complete_df [is.na(complete_df)] <- 0

            
# compute existing businesses per city category at the eond of each year
      complete_df <-  complete_df %>%
                          group_by( city_name , first_subcategory ) %>%
                          mutate ( open_cum_business= cumsum(business_open) ,
                                   close_cum_business= cumsum(business_close) ,
                                   existing_business= open_cum_business-close_cum_business  )
      
# remove cumulated count to keep the dataset simpler                  
      complete_df$open_cum_business <- NULL
      complete_df$close_cum_business <- NULL

# replace existing business 0 with reviews in the same year with business close
# because reviews concerned businesses actually closed
      complete_df$existing_business[complete_df$existing_business==0 & complete_df$reviews>0] <- complete_df$business_close[complete_df$existing_business==0 & complete_df$reviews>0]
      
# compute share of reviews with respect to share of business
      reviews_per_city_year <- review_df %>%
                                  group_by( city_name , year ) %>%
                                  summarise( tot_reviews_per_city = n() )
      complete_df <-  left_join(complete_df,reviews_per_city_year,by=c("city_name"  , "year") )
      
      business_per_city_year <- complete_df %>%
                                  group_by( city_name , year ) %>%
                                  summarise( businesses_per_city= sum(existing_business))
      complete_df <-  left_join(complete_df,business_per_city_year,by=c("city_name"  , "year") )
     
      complete_df$review_share       <- complete_df$reviews/complete_df$tot_reviews_per_city
      complete_df$business_share     <- complete_df$existing_business/complete_df$businesses_per_city
      
      complete_df$tot_reviews_per_city    <- NULL
      complete_df$businesses_per_city     <- NULL

We clean the complete_df dataset

# 2015 has too few observation      
# incomplete year
      complete_df <- complete_df %>%
                                filter( year < 2015  )
# 12760 x 9
# remove not existing year per city
      city_first_year <- complete_df %>%
                                    filter ( reviews >=1 ) %>%
                                    group_by(  city_name  ) %>%
                                    summarise( first_year= min(year) )
                                    
      complete_df <- inner_join( complete_df , city_first_year , by= "city_name" ) 
      complete_df <- complete_df %>%
                        filter(year>=first_year)  # 11020x10

We compute the number of existing businesses at the midpoint between the current and the previous year. This variable will be used to compute the ratio between revews share and business share

      complete_df_previous <- complete_df %>%
                                select ( city_name , first_subcategory , year ,  existing_business  ) 
      complete_df_previous$year <- complete_df_previous$year+1 
      colnames(complete_df_previous)[-c(1:3)] <- paste(colnames(complete_df_previous)[-c(1:3)],"t1",sep="_" )
      complete_df <-  left_join(complete_df,complete_df_previous,by=c("city_name" , "first_subcategory" , "year") )

      # we replace NAs of t1 values with end-of-period value
      complete_df_first_values <- complete_df %>%
                                  group_by ( city_name , first_subcategory ) %>%
                                  mutate( first_year = min(year) ) %>%
                                  filter ( year == first_year) %>%
                                  select ( city_name , first_subcategory , year , existing_business )
      
      
      # column integration with original and adjusted columns
      x_values <- semi_join( complete_df, complete_df_first_values , by=c("city_name" , "first_subcategory" , "year") )
      x_values$existing_business_t1 <- complete_df_first_values$existing_business
      
      # remove original columns from complete_df
      complete_df <- anti_join(complete_df,x_values,by=c("city_name" , "first_subcategory" , "year") )
      complete_df <- rbind( complete_df , x_values ) # replace with adjusted subset
      complete_df <- complete_df %>%
                          arrange(city_name,first_subcategory,year)
      
      # review_business_share_ratio is computed with respect to businesses of previous year
      # in order to compute the ratio reviews/existing restaurants
      # I take the average value between existing_business_prev and existing_business
      complete_df$business_midyear <- complete_df$existing_business_t1
      complete_df$business_midyear[is.na(complete_df$business_midyear)] <-                                                      complete_df$existing_business[is.na(complete_df$business_midyear)]
      complete_df$business_midyear <- (complete_df$existing_business+complete_df$existing_business_t1)/2
      # if the number of businesses at the end of period  is 1 then halfway is 1
      complete_df$business_midyear[complete_df$business_midyear<1 & complete_df$business_midyear>0 ] <- 1
      complete_df$business_midyear[complete_df$existing_business<2 & complete_df$existing_business_t1==0] <- 1
      complete_df$business_midyear[complete_df$existing_business==0 & complete_df$existing_business_t1==0] <- 0
      
      # some restaurants have open and close in same year
      complete_df$business_midyear[complete_df$existing_business==0 & complete_df$existing_business_t1==0  ] <-                       complete_df$business_open[complete_df$existing_business==0 & complete_df$existing_business_t1==0  ] 
    
      
     # now I need to compute the business share of midpoints
      business_mid_per_city_year <- complete_df %>%
                                      group_by( city_name , year ) %>%
                                      summarise( businesses_mid_per_city= sum(business_midyear) )
      complete_df <-  left_join(complete_df,business_mid_per_city_year,by=c("city_name"  , "year") )
    
      complete_df$business_midyear_share <- complete_df$business_midyear/complete_df$businesses_mid_per_city
      complete_df$businesses_mid_per_city <- NULL
      complete_df$review_business_share_ratio <- complete_df$review_share/complete_df$business_midyear_share
       
      complete_df <- complete_df %>%
                        filter(!is.na(review_business_share_ratio))  # 4365

Then we add the variables for our analysis:
- SRvsSB vs historical mean at previous period
- business_share vs historical mean

 # compute lagged values of review_business_share_ratio
      SRvsSB_previous <- complete_df %>%
                              select ( city_name , first_subcategory , year, review_business_share_ratio ) 
      SRvsSB_previous$year <- SRvsSB_previous$year+1 
      colnames(SRvsSB_previous)[-c(1:3)] <- paste(colnames(SRvsSB_previous)[-c(1:3)],"t1",sep="_" )
      complete_df <-  left_join(complete_df,SRvsSB_previous,by=c("city_name" , "first_subcategory" , "year") )
     
      
  # historical average lagged values
      complete_df_hist_avg <-complete_df %>% 
                                arrange(city_name,first_subcategory,year)%>%
                                group_by(city_name,first_subcategory)%>%
                                mutate( business_share_hist = cummean(business_share) ,
                                        SRvsSB_avg_hist = cummean( review_business_share_ratio ) ) %>%
                                select (city_name,first_subcategory,year, business_share_hist ,SRvsSB_avg_hist )
        
      complete_df_hist_avg$year <- complete_df_hist_avg$year+1 
      colnames(complete_df_hist_avg)[-c(1:3)] <- paste(colnames(complete_df_hist_avg)[-c(1:3)],"t1",sep="_" )
      
# we add lagged values of business_share historical average
      complete_df <-  left_join(complete_df,complete_df_hist_avg[,1:4],by=c("city_name" , "first_subcategory" , "year") )
    
      
# historical average of SRvsSB_avg_hist is lagged another time to become comparable with review_business_share_ratio_t1
# this way we analyze change in business_share vs change in SRvsSB at previous period
      
      complete_df_hist_avg$year <- complete_df_hist_avg$year+1 
      colnames(complete_df_hist_avg)[-c(1:3)] <- paste(colnames(complete_df_hist_avg)[-c(1:3)],"t2",sep="_" )
      complete_df <-  left_join(complete_df, complete_df_hist_avg[,c(1:3,5)] ,by=c("city_name" , "first_subcategory" , "year") )
      colnames(complete_df)[17] <- "SRvsSB_avg_hist_t2"
      
      colnames(complete_df)[14:17] <- c( "SRvsSB" , "SRvsSB_t1" , "business_share_avg_hist_t1"  ,   "SRvsSB_avg_hist_t2" )
      

      complete_df <- complete_df %>%
                        mutate( SRvsSB_vs_hist= SRvsSB_t1/SRvsSB_avg_hist_t2 ,
                                business_share_vs_hist = business_share/business_share_avg_hist_t1) 
                        
      complete_df <- complete_df %>%
                          filter(!is.na(SRvsSB_vs_hist))  # 3095

Finally We export the datasets for analysis.

    saveRDS(review_df , file= "data/review_df.rds")
    saveRDS(business_df , file= "data/business_df.rds")
    saveRDS(complete_df , file= "data/complete_df.rds")

DSS Capstone - Data Preparation

Massimo Zanetti

Sunday, November 8, 2015