Code Kata 8

Why do we write code? At one level, we’re trying to solve some particular problem, to add some kind of value to the world. But often there are also secondary objectives: the code has to solve the problem, and it also has to be fast, or easy to maintain, or extend, or whatever. So let’s look at that. For this kata, we’re going to write a program to solve a simple problem, and we’re going to write it with three different sub-objectives. Our program is going to process the dictionary we used in previous kata, this time looking for all six-letter words which are composed of two concatenated smaller words. For example: al + bums ⇒ albums; bar + ely ⇒ barely; be + foul ⇒ befoul; con + vex ⇒ convex; here + by ⇒ hereby; jig + saw ⇒ jigsaw; tail + or ⇒ tailor; we + aver ⇒ weaver.

Make program as readable as you can make it

> # Load the kata word list into a one-column data frame named "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> colnames(f) <- c("word")
> # regexpr() returns -1 for words with no character outside letters/space;
> # only those words are kept, lower-cased, and tagged with their length.
> status <- as.data.frame(cbind(regexpr("[^ a-zA-Z]", f$word)))
> sanitized <- data.frame(word = f$word[c(status < 0)])
> sanitized$word <- tolower(sanitized$word)
> sanitized$word <- as.character(sanitized$word)
> sanitized$word.length <- nchar(sanitized$word)
> t1 <- Sys.time()
> # Words longer than six letters cannot take part, so drop them first.
> condition <- with(sanitized, word.length <= 6)
> sanitized <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 6)
> words.6 <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 2)
> words.2 <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 4)
> words.4 <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 3)
> words.3 <- sanitized[condition, , drop = FALSE]
> # Build every 2 + 4 and 3 + 3 concatenation as a one-column matrix.
> data.24 <- cbind(c(outer(words.2$word, words.4$word, paste0)))
> data.33 <- cbind(c(outer(words.3$word, words.3$word, paste0)))
> test <- rbind(data.24, data.33)
> data.test <- data.frame(word = test[, ], stringsAsFactors = FALSE)
> # Left-join the candidates onto the real six-letter words; a candidate is a
> # real word exactly when word.length was filled in by the join.
> data.merged <- merge(data.test, words.6, by.x = "word", by.y = "word",
+     all.x = TRUE)
> special.words.count <- sum(!is.na(data.merged$word.length))
> special.words <- data.merged[(!is.na(data.merged$word.length)),
+     "word"]
> t2 <- Sys.time()
> delta <- t2 - t1
> print(special.words.count)
[1] 876
> cat("The time taken to find the special words is", delta, "seconds",
+     "\n")
The time taken to find the special words is 8.378 seconds

There are 876 words that satisfy the rule described in the code kata.

Optimize the program to run as fast as you can make it

> # Load the kata word list into a one-column data frame named "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> colnames(f) <- c("word")
> # Keep only words made of letters/space (regexpr() == -1), lower-cased.
> status <- as.data.frame(cbind(regexpr("[^ a-zA-Z]", f$word)))
> sanitized <- data.frame(word = f$word[c(status < 0)])
> sanitized$word <- tolower(sanitized$word)
> sanitized$word <- as.character(sanitized$word)
> sanitized$word.length <- nchar(sanitized$word)
> t1 <- Sys.time()
> condition <- with(sanitized, word.length == 6)
> words.6 <- sanitized[condition, , drop = FALSE]
> # Shrink the table to the only piece lengths used below (2, 3 and 4) before
> # slicing it per length.
> condition <- with(sanitized, word.length == 2 | word.length ==
+     4 | word.length == 3)
> sanitized <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 2)
> words.2 <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 4)
> words.4 <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 3)
> words.3 <- sanitized[condition, , drop = FALSE]
> # Build every 2 + 4 and 3 + 3 concatenation as a one-column matrix.
> data.24 <- cbind(c(outer(words.2$word, words.4$word, paste0)))
> data.33 <- cbind(c(outer(words.3$word, words.3$word, paste0)))
> test <- rbind(data.24, data.33)
> data.test <- data.frame(word = test[, ], stringsAsFactors = FALSE)
> # Left-join onto the six-letter words; matched rows get a word.length.
> data.merged <- merge(data.test, words.6, by.x = "word", by.y = "word",
+     all.x = TRUE)
> special.words <- data.merged[(!is.na(data.merged$word.length)),
+     "word"]
> t2 <- Sys.time()
> special.words.count <- sum(!is.na(data.merged$word.length))
> delta <- t2 - t1
> print(special.words.count)
[1] 876
> cat("The time taken to find the special words is", delta, "seconds",
+     "\n")
The time taken to find the special words is 7.949 seconds

Write as extendible a program as you can

I don’t fully understand what an extendible program means. What I gather is that it should be flexible to changes, so that it can be used by others.

> # Read the word list and name its single column "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> names(f) <- "word"
> # regexpr() is -1 where a word has no character outside letters/space.
> status <- regexpr("[^ a-zA-Z]", f$word)
> sanitized <- data.frame(word = f$word[status < 0])
> sanitized$word <- as.character(tolower(sanitized$word))
> sanitized$word.length <- nchar(sanitized$word)
> # Find words of length `len` that are the concatenation of two shorter words.
> #
> # sanitized: data frame with a character column `word` and a column
> #            `word.length` holding the length of each word.
> # len:       length of the target words.
> #
> # Candidates are built by pasting every word of length a onto every word of
> # length b for (a, b) = (1, len - 1), (2, len - 2), ...; only the first
> # len/2 of these splits (even len) or (len + 1)/2 (odd len) are tried. Each
> # batch of candidates is then looked up among the words of length `len`.
> # Returns a character vector of matches (NULL when no split had candidates).
> getSpecialWords <- function(sanitized, len) {
+     condition <- with(sanitized, word.length == len)
+     words.len <- sanitized[condition, , drop = FALSE]
+     combos <- cbind(seq_len(len - 1), rev(seq_len(len - 1)))
+     rel.combos <- if (len%%2 == 0) len/2 else (len + 1)/2
+     # drop = FALSE keeps a single selected row as a matrix (e.g. len == 2),
+     # so the row/column indexing below keeps working.
+     combos <- combos[seq_len(min(rel.combos, nrow(combos))), ,
+         drop = FALSE]
+     # One slot per split; filled in place instead of growing a vector.
+     found <- vector("list", nrow(combos))
+     for (i in seq_len(nrow(combos))) {
+         words.1 <- sanitized[sanitized$word.length == combos[i, 1], ,
+             drop = FALSE]
+         words.2 <- sanitized[sanitized$word.length == combos[i, 2], ,
+             drop = FALSE]
+         if (nrow(words.1) != 0 && nrow(words.2) != 0) {
+             data.test <- data.frame(word = c(outer(words.1$word,
+                 words.2$word, paste0)), stringsAsFactors = FALSE)
+             # Left join: a candidate is a real word exactly when the join
+             # filled in word.length.
+             data.merged <- merge(data.test, words.len, by.x = "word",
+                 by.y = "word", all.x = TRUE)
+             found[[i]] <- data.merged[!is.na(data.merged$word.length),
+                 "word"]
+         }
+     }
+     unlist(found)
+ }
> # Time the search for six-letter words.
> len <- 6
> t1 <- Sys.time()
> res <- getSpecialWords(sanitized, len)
> t2 <- Sys.time()
> delta <- difftime(t2, t1)
> print(length(res))
[1] 876
> cat("The time taken to find the special words is", delta, "\n")
The time taken to find the special words is 7.412
> # Same measurement for seven-letter words.
> len <- 7
> t1 <- Sys.time()
> res <- getSpecialWords(sanitized, len)
> t2 <- Sys.time()
> delta <- difftime(t2, t1)
> print(length(res))
[1] 1551
> cat("The time taken to find the special words is", delta, "\n")
The time taken to find the special words is 1.2769

Learnings - Code is Written by a worst programmer

  • Surprisingly, the extendible program took the least amount of time.
  • The extendible program works well for 6 and 7 but takes a lot of time from 8 onwards. Maybe there is a better way to do it in Python; maybe I need to use some algorithm to do it quickly.
  • I am realizing that I need to get an understanding of Python algorithms. I should utilize this golden time to prepare and work on it.

Revisit

I thought that codekata 8 was over, as I had already converted the document into PDF as well as AsciiDoc. I went for a break in the evening and, while climbing the stairs, I realized that I had used the lookup in the wrong direction. I came back and coded a very basic version of the lookup, and the time it took was less than 2 seconds, a massive improvement over the 10 seconds that the code was taking in the morning.

The biggest change in the code was that I broke the 6-letter words into various combinations and then looked up these pieces in the master list of 2-, 3-, 4- and 5-letter words.

Make program as readable as you can make it.

> # Load the kata word list into a one-column data frame named "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> colnames(f) <- c("word")
> # Keep only words made of letters/space (regexpr() == -1), lower-cased.
> status <- as.data.frame(cbind(regexpr("[^ a-zA-Z]", f$word)))
> sanitized <- data.frame(word = f$word[c(status < 0)])
> sanitized$word <- tolower(sanitized$word)
> sanitized$word <- as.character(sanitized$word)
> sanitized$word.length <- nchar(sanitized$word)
> t1 <- Sys.time()
> # Two copies of the shorter words whose length column is renamed to "a" and
> # "b"; after the joins below, a non-NA "a"/"b" marks a first/second piece
> # that is itself a word.
> condition <- with(sanitized, word.length < 6)
> check1 <- sanitized[condition, , drop = FALSE]
> colnames(check1) <- c("word", "a")
> check2 <- sanitized[condition, , drop = FALSE]
> colnames(check2) <- c("word", "b")
> condition <- with(sanitized, word.length == 6)
> # Split every six-letter word as 2 + 4 and look up both pieces.
> model <- sanitized[condition, 1, drop = FALSE]
> model$w1 <- substring(model$word, 1, 2)
> model$w2 <- substring(model$word, 3, 6)
> res <- merge(model, check1, by.x = "w1", by.y = "word", all.x = TRUE)
> res <- merge(res, check2, by.x = "w2", by.y = "word", all.x = TRUE)
> res1 <- res[!is.na(res$a) & !is.na(res$b), "word"]
> # Same again for the 3 + 3 split.
> model <- sanitized[condition, 1, drop = FALSE]
> model$w1 <- substring(model$word, 1, 3)
> model$w2 <- substring(model$word, 4, 6)
> res <- merge(model, check1, by.x = "w1", by.y = "word", all.x = TRUE)
> res <- merge(res, check2, by.x = "w2", by.y = "word", all.x = TRUE)
> res2 <- res[!is.na(res$a) & !is.na(res$b), "word"]
> res3 <- c(res1, res2)
> t2 <- Sys.time()
> delta1 <- t2 - t1
> print(length(res3))
[1] 876
> cat("The time taken to find the special words new method", delta1,
+     "seconds", "\n")
The time taken to find the special words new method 0.8970001 seconds

Fantastic. The code runs in less than 1.5 seconds. I will work with this code and answer the remaining questions

Optimize the program to run as fast as you can make it

> # Load the kata word list into a one-column data frame named "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> colnames(f) <- c("word")
> # Keep only words made of letters/space (regexpr() == -1), lower-cased.
> status <- as.data.frame(cbind(regexpr("[^ a-zA-Z]", f$word)))
> sanitized <- data.frame(word = f$word[c(status < 0)])
> sanitized$word <- tolower(sanitized$word)
> sanitized$word <- as.character(sanitized$word)
> sanitized$word.length <- nchar(sanitized$word)
> t1 <- Sys.time()
> # One lookup table of shorter words, joined twice; merge() then names the
> # two copies of its length column word.length.x (first piece) and
> # word.length.y (second piece).
> condition <- with(sanitized, word.length < 6)
> check <- sanitized[condition, , drop = FALSE]
> condition <- with(sanitized, word.length == 6)
> # Split every six-letter word as 2 + 4 and look up both pieces.
> model <- sanitized[condition, 1, drop = FALSE]
> model$w1 <- substring(model$word, 1, 2)
> model$w2 <- substring(model$word, 3, 6)
> res <- merge(model, check, by.x = "w1", by.y = "word", all.x = TRUE)
> res <- merge(res, check, by.x = "w2", by.y = "word", all.x = TRUE)
> res1 <- res[!is.na(res$word.length.x) & !is.na(res$word.length.y),
+     "word"]
> # Same again for the 3 + 3 split.
> model <- sanitized[condition, 1, drop = FALSE]
> model$w1 <- substring(model$word, 1, 3)
> model$w2 <- substring(model$word, 4, 6)
> res <- merge(model, check, by.x = "w1", by.y = "word", all.x = TRUE)
> res <- merge(res, check, by.x = "w2", by.y = "word", all.x = TRUE)
> res2 <- res[!is.na(res$word.length.x) & !is.na(res$word.length.y),
+     "word"]
> res3 <- c(res1, res2)
> t2 <- Sys.time()
> delta1 <- t2 - t1
> print(length(res3))
[1] 876
> cat("The time taken to find the special words new method", delta1,
+     "seconds", "\n")
The time taken to find the special words new method 0.277 seconds

Ok some minor reduction in the time taken

Write as extendible a program as you can

> # Read the word list and name its single column "word".
> # NOTE(review): read.delim() defaults to header = TRUE, so the first line of
> # the file becomes a column name instead of a word -- confirm that is intended.
> f <- read.delim("codekata_wordlist.txt", stringsAsFactors = FALSE)
> names(f) <- "word"
> # regexpr() is -1 where a word has no character outside letters/space.
> status <- regexpr("[^ a-zA-Z]", f$word)
> sanitized <- data.frame(word = f$word[status < 0])
> sanitized$word <- as.character(tolower(sanitized$word))
> sanitized$word.length <- nchar(sanitized$word)
> # Find words of length `len` that split into two shorter words.
> #
> # sanitized: data frame whose first column is the character column `word`
> #            and which has a column `word.length` with each word's length.
> # len:       length of the target words.
> #
> # Every word of length `len` is cut at each position 1..(len - 1) and both
> # pieces are looked up among the words shorter than `len`. A word is returned
> # once per cut position that works, so the result may contain repeats.
> # Returns a character vector (NULL when len < 2 leaves no cut position).
> getSpecialWords <- function(sanitized, len) {
+     condition <- with(sanitized, word.length < len)
+     check <- sanitized[condition, , drop = FALSE]
+     condition <- with(sanitized, word.length == len)
+     model <- sanitized[condition, 1, drop = FALSE]
+     combos <- seq_len(len - 1)
+     # One slot per cut position; filled in place instead of growing a vector.
+     found <- vector("list", length(combos))
+     for (i in seq_along(combos)) {
+         test <- model
+         test$w1 <- substring(test$word, 1, combos[i])
+         # The second piece must run to `len`; a literal 6 here truncated it
+         # for every other length, so counts obtained that way for len != 6
+         # are not comparable.
+         test$w2 <- substring(test$word, combos[i] + 1, len)
+         res <- merge(test, check, by.x = "w1", by.y = "word",
+             all.x = TRUE)
+         res <- merge(res, check, by.x = "w2", by.y = "word",
+             all.x = TRUE)
+         # word.length.x / word.length.y are filled only when the first /
+         # second piece was found in `check`.
+         keep <- !is.na(res$word.length.x) & !is.na(res$word.length.y)
+         found[[i]] <- res[keep, "word"]
+     }
+     unlist(found)
+ }
> # Time the search for six-letter words.
> len <- 6
> t1 <- Sys.time()
> res <- getSpecialWords(sanitized, len)
> t2 <- Sys.time()
> delta <- difftime(t2, t1)
> print(length(res))
[1] 1536
> cat("The time taken to find the special words is", delta, "\n")
The time taken to find the special words is 0.546
> # Same measurement for nine-letter words.
> len <- 9
> t1 <- Sys.time()
> res <- getSpecialWords(sanitized, len)
> t2 <- Sys.time()
> delta <- difftime(t2, t1)
> print(length(res))
[1] 1003
> cat("The time taken to find the special words is", delta, "\n")
The time taken to find the special words is 0.971

Wow. The code now takes approximately 0.7 seconds. From 10 seconds to 0.7 seconds, that’s an improvement of about 15 times. Yahoooooo.

I am so happy that the code is optimized

Learnings

  • Keep thinking. You can come up with surprisingly elegant solutions