CKTG Web Scraping

Summary

Chef Knives To Go (CKTG) features a catalog of around 1000 knives spread over various form factors and steel make-ups.

Getting an idea of the ideal price point for each form factor and steel type is arduous: there are more than 20 different form factors for a knife, and each form factor can come in close to 20 different steel types.

Process

In order to build a dataset of knives, the most effective approach is to crawl through the website multiple times.

CKTG lists knives in multiple ways; the two main ways to browse them are by form factor and by steel type.

By first crawling through all of the knives by steel type, then crawling again by form factor and joining the results, we avoid having to extract the steel type from each knife's individual page.

Using R and the package ‘rvest’ we can load all of the pages into our script and pull the relevant information for each knife

The following views from CKTG show how the website is organized

After clicking through any of the above we are either led to a sub-category, or to a page of knives

For our analysis, this is all of the data we need. By using nested for loops we can navigate past all of the sub-categories and collect all of the knife information

The following is an excerpt of the dataset

For each knife, we are able to store a link to the knife on CKTG, the title, its price, the type of steel used, the manufacturer, the form factor, along with a few other traits

SubType: This is a sub-category found when crawling by steel type; we could potentially get extra information from it with more processing.

SubStyle: This is a sub-category found when crawling by form factor; it represents the different lengths that some knife styles come in.

SubSubStyle: This is a further sub-category found when crawling by form factor; it represents the different handle styles that some knives come with (Western vs. Japanese).

Code

Most of the data we need can be scraped with the following function:


# Given a parsed page `x`, collect the attributes of every link found inside
# the page's ".section-details" elements, one row per link.
function(x) {
  html_elements(x, ".section-details") %>% # item containers on the page
    html_elements("a") %>%                 # the links inside each container
    html_attrs() %>% # all attributes of each link
    tibble() %>%                           # wrap the list in a one-column tibble
    unnest_wider(".")                      # spread the attributes into columns
}

This function uses the pipe operator (%>%) to feed the result of each command into the next one.

html_elements selects page elements by CSS selector: first the .section-details block of each item on the pages listed under Process, then the links (a) inside it.

html_attrs returns the attributes of each of those links, which include the link (href) to the page that the item corresponds to as well as its title.

tibble and unnest_wider are used to transform the list of link attributes we have into a data frame that is easy to loop over.

After looping over these categories we eventually end up at a collection of knives, like the third image in Process

Here we execute the same function as above along with a similar function that pulls the price data

Now that we have reached individual knives, we compile the titles of all the pages that we traversed, which gives us the manufacturer and steel type.

For the second round of web-scraping we do the same thing as above, and simply add in the additional information that we collect about the knife’s form factor.

Link to code: https://youngbloodkyle.github.io/CKTG_Web_Scraping_adv.nb.html


library(rvest)
library(dplyr)
library(tidyr)


# Helper: list the links on a ChefKnivesToGo page.
#
# `x` is a parsed HTML document (or node). The result is a tibble with one row
# per <a> element found inside the page's ".section-details" elements and one
# column per link attribute; the crawl below relies on `href` and `title`.
# The same helper is reused for category pages and for knife listing pages.

cktg_query <- function(x) {
  detail_links <- html_elements(html_elements(x, ".section-details"), "a")
  link_attrs <- html_attrs(detail_links)
  unnest_wider(tibble(link_attrs = link_attrs), "link_attrs")
}


# ---- Pass 1: crawl by steel type --------------------------------------------
# Walks "shop by steel" -> steel page -> maker page -> (optional) sub-type page
# and accumulates one row per knife link in `df`, de-duplicated on `href`.
# On top of the scraped link attributes each row gets: Prices, SteelGroup,
# Maker and SubType.

mat <- matrix(ncol = 0, nrow = 0)
df <- data.frame(mat)
cktg <- "https://www.chefknivestogo.com/"
cktg_steels <- "shopbysteel.html"
cktg_steels_read <- read_html(paste0(cktg, cktg_steels))
steels <- cktg_steels_read %>%
  cktg_query()

# seq_along() (not 1:length()) so that an empty link list means zero iterations.
for (x in seq_along(steels$href)) {
  cktg_makers_read <- read_html(paste0(cktg, steels$href[x]))

  makers <- cktg_makers_read %>%
    cktg_query()

  for (y in seq_along(makers$href)) {
    tryCatch(
      {
        cktg_knives_read <- read_html(paste0(cktg, makers$href[y]))

        # A page with at least one ".price" element is a knife listing;
        # anything else is treated as a list of sub-type pages.
        has_prices <- length(html_elements(cktg_knives_read, ".price")) > 0

        if (has_prices) {
          prices <- cktg_knives_read %>%
            html_elements(".section-details") %>%
            html_elements(".price") %>%
            html_text()
          # NOTE(review): assumes one price per scraped link; if the counts
          # differ, mutate() errors and the page is skipped by the handler.
          knives <- cktg_knives_read %>%
            cktg_query() %>%
            mutate(
              Prices = prices,
              SteelGroup = steels$title[x],
              Maker = makers$title[y],
              SubType = NA
            )
          # Append only knives not collected yet. While `df` is still empty,
          # `df$href` is NULL and every knife counts as new, so no separate
          # "first page" flag is needed.
          is_new <- !knives$href %in% df$href
          if (any(is_new)) {
            df <- bind_rows(df, knives[is_new, ])
          }
        } else {
          # Hard-coded override for the 6th maker of the 3rd steel: the
          # sub-type list is read from kohetsu.html instead of the linked
          # page. NOTE(review): the indices depend on the site's current
          # ordering -- confirm they still point at the intended page.
          if (x == 3 && y == 6) {
            cktg_knives_read <- read_html(paste0(cktg, "kohetsu.html"))
          }
          sub_type <- cktg_knives_read %>%
            cktg_query()

          # Iterate over the sub-type links (rows); length(sub_type) would be
          # the number of columns of the tibble.
          for (z in seq_along(sub_type$href)) {
            cktg_knives_sub_read <-
              read_html(paste0(cktg, sub_type$href[z]))
            prices <- cktg_knives_sub_read %>%
              html_elements(".section-details") %>%
              html_elements(".price") %>%
              html_text()
            knives <- cktg_knives_sub_read %>%
              cktg_query() %>%
              mutate(
                Prices = prices,
                SteelGroup = steels$title[x],
                Maker = makers$title[y],
                SubType = sub_type$title[z]
              )
            # Known knives only get their SubType filled in; new ones are
            # appended one at a time so repeats within a page are not
            # duplicated.
            for (t in seq_along(knives$href)) {
              if (knives$href[t] %in% df$href) {
                df$SubType[df$href %in% knives$href[t]] <- knives$SubType[t]
              } else {
                df <- bind_rows(df, knives[t, ])
              }
            }
          }
        }
      },
      # The handler must be passed as a *named* argument (`error =`); with
      # `error <-` it is not registered for the "error" condition class.
      error = function(e) {
        cat("ERROR :", conditionMessage(e), "\n")
      }
    )
  }
}


# ---- Pass 2: crawl by form factor -------------------------------------------
# Walks the knife-type index -> type page -> (optional) style page ->
# (optional) sub-style page and tags the knives already collected in `df`
# (matched on `href`) with Style / SubStyle / SubSubStyle. Knives that are not
# in `df` are ignored; no rows are added in this pass.

df$Style <- NA
df$SubStyle <- NA
df$SubSubStyle <- NA
cktg_types <- "resources.html"
cktg_types_read <- read_html(paste0(cktg, cktg_types))
types <- cktg_types_read %>%
  html_elements(".ysw-lp-row-item") %>%
  html_elements("h2") %>%
  html_elements("a") %>%
  html_attrs() %>%
  tibble() %>%
  unnest_wider(".")

# Entry 16 of the index is deliberately skipped.
# NOTE(review): the reason is not recorded here and the index depends on the
# site's current ordering -- confirm it still refers to the intended entry.
for (x in seq_along(types$href)[-16]) {
  tryCatch(
    {
      cktg_styles_knives_read <- read_html(paste0(cktg, types$href[x]))

      # A page with at least one ".price" element is a knife listing.
      if (length(html_elements(cktg_styles_knives_read, ".price")) > 0) {
        knife_hrefs <- cktg_styles_knives_read %>%
          html_elements(".section-details") %>%
          html_elements("a") %>%
          html_attr("href")
        # Every knife on the page shares the same Style, so tag all matching
        # rows in one vectorised assignment.
        df$Style[df$href %in% knife_hrefs] <- types$title[x]
      } else {
        styles <- cktg_styles_knives_read %>%
          cktg_query()

        for (y in seq_along(styles$href)) {
          cktg_styles_read <- read_html(paste0(cktg, styles$href[y]))

          if (length(html_elements(cktg_styles_read, ".price")) > 0) {
            knives <- cktg_styles_read %>%
              cktg_query()
            matched <- df$href %in% knives$href
            df$Style[matched] <- types$title[x]
            df$SubStyle[matched] <- styles$title[y]
          } else {
            sub_type <- cktg_styles_read %>%
              cktg_query()

            # Iterate over the sub-style links (rows); length(sub_type) would
            # be the number of columns of the tibble.
            for (z in seq_along(sub_type$href)) {
              cktg_knives_sub_read <-
                read_html(paste0(cktg, sub_type$href[z]))
              knives <- cktg_knives_sub_read %>%
                cktg_query()
              matched <- df$href %in% knives$href
              df$Style[matched] <- types$title[x]
              df$SubStyle[matched] <- styles$title[y]
              df$SubSubStyle[matched] <- sub_type$title[z]
            }
          }
        }
      }
    },
    # The handler must be passed as a *named* argument (`error =`); with
    # `error <-` it is not registered for the "error" condition class.
    error = function(e) {
      cat("ERROR :", conditionMessage(e), "\n")
    }
  )
}

write.csv(df, file = "CKTG_KNIFE.csv")