This project report follows the steps given in the instructions pdf file
This step uses read_csv() function to read in the csv as
a tibble.
# Install the necessary packages for the project
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
# Reading the csv
sheet1 <- read_csv("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
sheet1
## # A tibble: 3,198 × 42
## Area_…¹ STCOU EDU01…² EDU01…³ EDU01…⁴ EDU01…⁵ EDU01…⁶ EDU01…⁷ EDU01…⁸ EDU01…⁹
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 UNITED… 00000 0 4.00e7 0000 0000 0 4.00e7 0000 0000
## 2 ALABAMA 01000 0 7.34e5 0000 0000 0 7.28e5 0000 0000
## 3 Autaug… 01001 0 6.83e3 0000 0000 0 6.9 e3 0000 0000
## 4 Baldwi… 01003 0 1.64e4 0000 0000 0 1.65e4 0000 0000
## 5 Barbou… 01005 0 5.07e3 0000 0000 0 5.10e3 0000 0000
## 6 Bibb, … 01007 0 3.56e3 0000 0000 0 3.51e3 0000 0000
## 7 Blount… 01009 0 7.32e3 0000 0000 0 7.22e3 0000 0000
## 8 Bulloc… 01011 0 2.01e3 0000 0000 0 1.98e3 0000 0000
## 9 Butler… 01013 0 4.64e3 0000 0000 0 4.58e3 0000 0000
## 10 Calhou… 01015 0 2.09e4 0000 0000 0 2.09e4 0000 0000
## # … with 3,188 more rows, 32 more variables: EDU010189F <dbl>,
## # EDU010189D <dbl>, EDU010189N1 <chr>, EDU010189N2 <chr>, EDU010190F <dbl>,
## # EDU010190D <dbl>, EDU010190N1 <chr>, EDU010190N2 <chr>, EDU010191F <dbl>,
## # EDU010191D <dbl>, EDU010191N1 <chr>, EDU010191N2 <chr>, EDU010192F <dbl>,
## # EDU010192D <dbl>, EDU010192N1 <chr>, EDU010192N2 <chr>, EDU010193F <dbl>,
## # EDU010193D <dbl>, EDU010193N1 <chr>, EDU010193N2 <chr>, EDU010194F <dbl>,
## # EDU010194D <dbl>, EDU010194N1 <chr>, EDU010194N2 <chr>, EDU010195F <dbl>, …
This steps uses select() function to take a subset of
columns and uses rename() function to rename “Area_name” to
“area_name”.
sheet2 <- sheet1 %>% select("Area_name","STCOU",ends_with("D")) %>% rename("area_name" = Area_name)
sheet2
## # A tibble: 3,198 × 12
## area_…¹ STCOU EDU01…² EDU01…³ EDU01…⁴ EDU01…⁵ EDU01…⁶ EDU01…⁷ EDU01…⁸ EDU01…⁹
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 UNITED… 00000 4.00e7 4.00e7 4.03e7 4.07e7 4.14e7 4.21e7 4.27e7 4.34e7
## 2 ALABAMA 01000 7.34e5 7.28e5 7.30e5 7.28e5 7.26e5 7.26e5 7.28e5 7.31e5
## 3 Autaug… 01001 6.83e3 6.9 e3 6.92e3 6.85e3 7.01e3 7.14e3 7.15e3 7.38e3
## 4 Baldwi… 01003 1.64e4 1.65e4 1.68e4 1.71e4 1.75e4 1.80e4 1.87e4 1.94e4
## 5 Barbou… 01005 5.07e3 5.10e3 5.07e3 5.16e3 5.17e3 5.25e3 5.14e3 5.11e3
## 6 Bibb, … 01007 3.56e3 3.51e3 3.57e3 3.62e3 3.65e3 3.56e3 3.56e3 3.55e3
## 7 Blount… 01009 7.32e3 7.22e3 7.20e3 7.21e3 7.16e3 7.15e3 7.21e3 7.3 e3
## 8 Bulloc… 01011 2.01e3 1.98e3 1.98e3 1.98e3 1.98e3 2.02e3 2.01e3 2.01e3
## 9 Butler… 01013 4.64e3 4.58e3 4.61e3 4.59e3 4.54e3 4.48e3 4.44e3 4.35e3
## 10 Calhou… 01015 2.09e4 2.09e4 2.09e4 2.08e4 2.08e4 2.05e4 2.05e4 2.01e4
## # … with 3,188 more rows, 2 more variables: EDU010195D <dbl>, EDU010196D <dbl>,
## # and abbreviated variable names ¹area_name, ²EDU010187D, ³EDU010188D,
## # ⁴EDU010189D, ⁵EDU010190D, ⁶EDU010191D, ⁷EDU010192D, ⁸EDU010193D,
## # ⁹EDU010194D
pivot_longer() function is used here to convert tibble
to long format.
sheet3 <- sheet2 %>% pivot_longer(cols=3:12,names_to="item_id",values_to="enrollment_value")
sheet3
## # A tibble: 31,980 × 4
## area_name STCOU item_id enrollment_value
## <chr> <chr> <chr> <dbl>
## 1 UNITED STATES 00000 EDU010187D 40024299
## 2 UNITED STATES 00000 EDU010188D 39967624
## 3 UNITED STATES 00000 EDU010189D 40317775
## 4 UNITED STATES 00000 EDU010190D 40737600
## 5 UNITED STATES 00000 EDU010191D 41385442
## 6 UNITED STATES 00000 EDU010192D 42088151
## 7 UNITED STATES 00000 EDU010193D 42724710
## 8 UNITED STATES 00000 EDU010194D 43369917
## 9 UNITED STATES 00000 EDU010195D 43993459
## 10 UNITED STATES 00000 EDU010196D 44715737
## # … with 31,970 more rows
Here substr() function is used to create new variables
“year” and “enrollment_id” from an existing variable “item_id” by using
index position of the values. as.Date() is used to convert
string values to date format.
sheet3$enrollment_id = substr(sheet3$item_id,1,7)
sheet3$year = substr(sheet3$item_id,8,9)
sheet3$year <- as.Date(sheet3$year,format="%y")
sheet3$year <- format(sheet3$year, "%Y")
sheet3$year <- as.integer(sheet3$year)
sheet3
## # A tibble: 31,980 × 6
## area_name STCOU item_id enrollment_value enrollment_id year
## <chr> <chr> <chr> <dbl> <chr> <int>
## 1 UNITED STATES 00000 EDU010187D 40024299 EDU0101 1987
## 2 UNITED STATES 00000 EDU010188D 39967624 EDU0101 1988
## 3 UNITED STATES 00000 EDU010189D 40317775 EDU0101 1989
## 4 UNITED STATES 00000 EDU010190D 40737600 EDU0101 1990
## 5 UNITED STATES 00000 EDU010191D 41385442 EDU0101 1991
## 6 UNITED STATES 00000 EDU010192D 42088151 EDU0101 1992
## 7 UNITED STATES 00000 EDU010193D 42724710 EDU0101 1993
## 8 UNITED STATES 00000 EDU010194D 43369917 EDU0101 1994
## 9 UNITED STATES 00000 EDU010195D 43993459 EDU0101 1995
## 10 UNITED STATES 00000 EDU010196D 44715737 EDU0101 1996
## # … with 31,970 more rows
grep() is used to fetch the index values which satisfies
the condition given in the function. These index values are used to
subset the original tibble into county and
state tibbles. For the county level tibble, a class
called ‘county’ is added to county tibble and similarly for the state
level tibble, a class called ‘state’ is added to state tibble.
county_index_vector <- grep(pattern = ", \\w\\w", sheet3$area_name)
county <- subset(sheet3, row.names(sheet3) %in% county_index_vector)
state <- subset(sheet3,!(row.names(sheet3) %in% county_index_vector))
state$area_name <- toupper(state$area_name)
class(county) <- c("county", class(county))
class(state) <- c("state", class(state))
county
## # A tibble: 31,450 × 6
## area_name STCOU item_id enrollment_value enrollment_id year
## <chr> <chr> <chr> <dbl> <chr> <int>
## 1 Autauga, AL 01001 EDU010187D 6829 EDU0101 1987
## 2 Autauga, AL 01001 EDU010188D 6900 EDU0101 1988
## 3 Autauga, AL 01001 EDU010189D 6920 EDU0101 1989
## 4 Autauga, AL 01001 EDU010190D 6847 EDU0101 1990
## 5 Autauga, AL 01001 EDU010191D 7008 EDU0101 1991
## 6 Autauga, AL 01001 EDU010192D 7137 EDU0101 1992
## 7 Autauga, AL 01001 EDU010193D 7152 EDU0101 1993
## 8 Autauga, AL 01001 EDU010194D 7381 EDU0101 1994
## 9 Autauga, AL 01001 EDU010195D 7568 EDU0101 1995
## 10 Autauga, AL 01001 EDU010196D 7834 EDU0101 1996
## # … with 31,440 more rows
state
## # A tibble: 530 × 6
## area_name STCOU item_id enrollment_value enrollment_id year
## <chr> <chr> <chr> <dbl> <chr> <int>
## 1 UNITED STATES 00000 EDU010187D 40024299 EDU0101 1987
## 2 UNITED STATES 00000 EDU010188D 39967624 EDU0101 1988
## 3 UNITED STATES 00000 EDU010189D 40317775 EDU0101 1989
## 4 UNITED STATES 00000 EDU010190D 40737600 EDU0101 1990
## 5 UNITED STATES 00000 EDU010191D 41385442 EDU0101 1991
## 6 UNITED STATES 00000 EDU010192D 42088151 EDU0101 1992
## 7 UNITED STATES 00000 EDU010193D 42724710 EDU0101 1993
## 8 UNITED STATES 00000 EDU010194D 43369917 EDU0101 1994
## 9 UNITED STATES 00000 EDU010195D 43993459 EDU0101 1995
## 10 UNITED STATES 00000 EDU010196D 44715737 EDU0101 1996
## # … with 520 more rows
str_split_fixed() function is used to split ‘area_name’
variable into 2 new variables ‘county_name’ and its corresponding
‘state’ based on the comma(,) delimiter.
county[c('county_name', 'state')] <- str_split_fixed(county$area_name, ',', 2)
county$state <- trimws(county$state, which = c("both"))
county
## # A tibble: 31,450 × 8
## area_name STCOU item_id enrollment_value enrollmen…¹ year count…² state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 EDU0101 1987 Autauga AL
## 2 Autauga, AL 01001 EDU010188D 6900 EDU0101 1988 Autauga AL
## 3 Autauga, AL 01001 EDU010189D 6920 EDU0101 1989 Autauga AL
## 4 Autauga, AL 01001 EDU010190D 6847 EDU0101 1990 Autauga AL
## 5 Autauga, AL 01001 EDU010191D 7008 EDU0101 1991 Autauga AL
## 6 Autauga, AL 01001 EDU010192D 7137 EDU0101 1992 Autauga AL
## 7 Autauga, AL 01001 EDU010193D 7152 EDU0101 1993 Autauga AL
## 8 Autauga, AL 01001 EDU010194D 7381 EDU0101 1994 Autauga AL
## 9 Autauga, AL 01001 EDU010195D 7568 EDU0101 1995 Autauga AL
## 10 Autauga, AL 01001 EDU010196D 7834 EDU0101 1996 Autauga AL
## # … with 31,440 more rows, and abbreviated variable names ¹enrollment_id,
## # ²county_name
For the state tibble, a new categorical variable ‘divison’ is created
which based on the 9 divisions vector mentioned in the code. Here I have
used boolean masking to extract states belonging to a particular
division and used mutate() to add ‘divison’ variable.
Division_1 <- c('CONNECTICUT','MAINE','MASSACHUSETTS','NEW HAMPSHIRE','RHODE ISLAND','VERMONT')
Division_2 <- c('NEW JERSEY','NEW YORK','PENNSYLVANIA')
Division_3 <- c('ILLINOIS','INDIANA','MICHIGAN','OHIO','WISCONSIN')
Division_4 <- c('IOWA','KANSAS','MINNESOTA','MISSOURI','NEBRASKA','NORTH DAKOTA','SOUTH DAKOTA')
Division_5 <- c('DELAWARE','FLORIDA','GEORGIA','MARYLAND','NORTH CAROLINA','SOUTH CAROLINA','VIRGINIA','DISTRICT OF COLUMBIA','WEST VIRGINIA')
Division_6 <- c('ALABAMA','KENTUCKY','MISSISSIPPI','TENNESSEE')
Division_7 <- c('ARKANSAS','LOUISIANA','OKLAHOMA','TEXAS')
Division_8 <- c('ARIZONA','COLORADO','IDAHO','MONTANA','NEVADA','NEW MEXICO','UTAH','WYOMING')
Division_9 <- c('ALASKA','CALIFORNIA','OREGON','WASHINGTON','HAWAII')
state1 <- state[state$area_name %in% Division_1,] %>% mutate(Division="New England")
state2 <- state[state$area_name %in% Division_2,] %>% mutate(Division="Mid-Atlantic")
state3 <- state[state$area_name %in% Division_3,] %>% mutate(Division="East North Central")
state4 <- state[state$area_name %in% Division_4,] %>% mutate(Division="West North Central")
state5 <- state[state$area_name %in% Division_5,] %>% mutate(Division="South Atlantic")
state6 <- state[state$area_name %in% Division_6,] %>% mutate(Division="East South Central")
state7 <- state[state$area_name %in% Division_7,] %>% mutate(Division="West South Central")
state8 <- state[state$area_name %in% Division_8,] %>% mutate(Division="Mountain")
state9 <- state[state$area_name %in% Division_9,] %>% mutate(Division="Pacific")
state_fed <- state[!state$area_name %in% c(Division_1,Division_2,Division_3,Division_4,Division_5,Division_6,Division_7,Division_8,Division_9),] %>% mutate(Division="ERROR")
state <- bind_rows(state1,state2,state3,state4,state5,state6,state7,state8,state9,state_fed)
state
## # A tibble: 530 × 7
## area_name STCOU item_id enrollment_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010187D 445646 EDU0101 1987 New England
## 2 CONNECTICUT 09000 EDU010188D 465279 EDU0101 1988 New England
## 3 CONNECTICUT 09000 EDU010189D 461519 EDU0101 1989 New England
## 4 CONNECTICUT 09000 EDU010190D 461560 EDU0101 1990 New England
## 5 CONNECTICUT 09000 EDU010191D 469013 EDU0101 1991 New England
## 6 CONNECTICUT 09000 EDU010192D 481049 EDU0101 1992 New England
## 7 CONNECTICUT 09000 EDU010193D 488476 EDU0101 1993 New England
## 8 CONNECTICUT 09000 EDU010194D 496303 EDU0101 1994 New England
## 9 CONNECTICUT 09000 EDU010195D 506824 EDU0101 1995 New England
## 10 CONNECTICUT 09000 EDU010196D 517935 EDU0101 1996 New England
## # … with 520 more rows
This function takes url has an input, reads the csv files and returns a tibble.
read_csv_code <- function(url){
# Install the necessary packages for the project
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
#Reading the csv
sheet1 <- read_csv(url)
return(sheet1)
}
This function combines step 1 and step 2 from previous section. It
takes an input tibble which is the output of
read_csv_code(). Along with this also takes ‘var_name’ as
input which is used to decide the variable name of the values. This is
an optional argument and default value for this passed through the
my_wrapper() which is discussed in the further sections.
Since function_for_step_1_2() is not called independently I
have passed the default value for this using my_wrapper()
function.
function_for_step_1_2 <- function(sheet1,var_name){
sheet2 <- sheet1 %>% select("Area_name","STCOU",ends_with("D")) %>% rename("area_name" = Area_name)
sheet3 <- sheet2 %>% pivot_longer(cols=3:12,names_to="item_id",values_to=var_name)
return(sheet3)
}
This function implements step 3 from previous section. It takes the
output of function_for_step_1_2() as input and returns a
tibble with ‘year’ variable extracted as an integer.
function_for_step_3 <- function(sheet3){
sheet3$enrollment_id = substr(sheet3$item_id,1,7)
sheet3$year = substr(sheet3$item_id,8,9)
sheet3$year <- as.Date(sheet3$year,format="%y")
sheet3$year <- format(sheet3$year, "%Y")
sheet3$year <- as.integer(sheet3$year)
return(sheet3)
}
This function implements step 5 from previous section. It receives county tibble as an input and extracts the abbreviation of state from ‘area_name’ as a new variable.
function_for_step_5 <- function(county){
county[c('county_name', 'state')] <- str_split_fixed(county$area_name, ',', 2)
county$state <- trimws(county$state, which = c("both"))
return(county)
}
This function implements step 6 from previous section. It takes state tibble as input and creates a new categorical variable ‘division’.
function_for_step_6 <- function(state){
Division_1 <- c('CONNECTICUT','MAINE','MASSACHUSETTS','NEW HAMPSHIRE','RHODE ISLAND','VERMONT')
Division_2 <- c('NEW JERSEY','NEW YORK','PENNSYLVANIA')
Division_3 <- c('ILLINOIS','INDIANA','MICHIGAN','OHIO','WISCONSIN')
Division_4 <- c('IOWA','KANSAS','MINNESOTA','MISSOURI','NEBRASKA','NORTH DAKOTA','SOUTH DAKOTA')
Division_5 <- c('DELAWARE','FLORIDA','GEORGIA','MARYLAND','NORTH CAROLINA','SOUTH CAROLINA','VIRGINIA','DISTRICT OF COLUMBIA','WEST VIRGINIA')
Division_6 <- c('ALABAMA','KENTUCKY','MISSISSIPPI','TENNESSEE')
Division_7 <- c('ARKANSAS','LOUISIANA','OKLAHOMA','TEXAS')
Division_8 <- c('ARIZONA','COLORADO','IDAHO','MONTANA','NEVADA','NEW MEXICO','UTAH','WYOMING')
Division_9 <- c('ALASKA','CALIFORNIA','OREGON','WASHINGTON','HAWAII')
state1 <- state[state$area_name %in% Division_1,] %>% mutate(Division="New England")
state2 <- state[state$area_name %in% Division_2,] %>% mutate(Division="Mid-Atlantic")
state3 <- state[state$area_name %in% Division_3,] %>% mutate(Division="East North Central")
state4 <- state[state$area_name %in% Division_4,] %>% mutate(Division="West North Central")
state5 <- state[state$area_name %in% Division_5,] %>% mutate(Division="South Atlantic")
state6 <- state[state$area_name %in% Division_6,] %>% mutate(Division="East South Central")
state7 <- state[state$area_name %in% Division_7,] %>% mutate(Division="West South Central")
state8 <- state[state$area_name %in% Division_8,] %>% mutate(Division="Mountain")
state9 <- state[state$area_name %in% Division_9,] %>% mutate(Division="Pacific")
state_fed <- state[!state$area_name %in% c(Division_1,Division_2,Division_3,Division_4,Division_5,Division_6,Division_7,Division_8,Division_9),] %>% mutate(Division="ERROR")
state <- bind_rows(state1,state2,state3,state4,state5,state6,state7,state8,state9,state_fed)
return(state)
}
This function implements step 4,5 and 6.
function_for_step_6() and
function_for_step_5() are not called independently but are
only called in this function. This function takes the output of
function_for_step_3() as input and splits the tibble into
state and county tibble and returns them as a list. Since the processing
of both step 5 and step 6 takes place within this function the state
tibble has a variable ‘division’ and county tibble has variable
‘state’.
function_for_steps4_5_6 <- function(sheet3){
county_index_vector <- grep(pattern = ", \\w\\w", sheet3$area_name)
county <- subset(sheet3, row.names(sheet3) %in% county_index_vector)
state <- subset(sheet3,!(row.names(sheet3) %in% county_index_vector))
state$area_name <- toupper(state$area_name)
class(county) <- c("county", class(county))
class(state) <- c("state", class(state))
county <- function_for_step_5(county)
state <- function_for_step_6(state)
return(list(county=county,state=state))
}
This function does all the processing steps mentioned above. It takes in the url which we want to process/plot and an optional argument “var_name” and returns state and county tibble as a list.
my_wrapper <- function(url,var_name="enrollment_value"){
a <- read_csv_code(url)
#print(a)
b <- function_for_step_1_2(a,var_name)
#print(b)
c <- function_for_step_3(b)
#print(c)
d <- function_for_steps4_5_6(c)
return(d)
}
Here I have made a call to my_wrapper() function to
process ‘EDU01b.csv’ with var_name as “enroll_value”.
d2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv",var_name = "enroll_value")
d2
## $county
## # A tibble: 31,450 × 8
## area_name STCOU item_id enroll_value enrollment_id year county_…¹ state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010197D 8099 EDU0101 1997 Autauga AL
## 2 Autauga, AL 01001 EDU010198D 8211 EDU0101 1998 Autauga AL
## 3 Autauga, AL 01001 EDU010199D 8489 EDU0101 1999 Autauga AL
## 4 Autauga, AL 01001 EDU010200D 8912 EDU0102 2000 Autauga AL
## 5 Autauga, AL 01001 EDU010201D 8626 EDU0102 2001 Autauga AL
## 6 Autauga, AL 01001 EDU010202D 8762 EDU0102 2002 Autauga AL
## 7 Autauga, AL 01001 EDU015203D 9105 EDU0152 2003 Autauga AL
## 8 Autauga, AL 01001 EDU015204D 9200 EDU0152 2004 Autauga AL
## 9 Autauga, AL 01001 EDU015205D 9559 EDU0152 2005 Autauga AL
## 10 Autauga, AL 01001 EDU015206D 9652 EDU0152 2006 Autauga AL
## # … with 31,440 more rows, and abbreviated variable name ¹county_name
##
## $state
## # A tibble: 530 × 7
## area_name STCOU item_id enroll_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010197D 525632 EDU0101 1997 New England
## 2 CONNECTICUT 09000 EDU010198D 535164 EDU0101 1998 New England
## 3 CONNECTICUT 09000 EDU010199D 544698 EDU0101 1999 New England
## 4 CONNECTICUT 09000 EDU010200D 554266 EDU0102 2000 New England
## 5 CONNECTICUT 09000 EDU010201D 562179 EDU0102 2001 New England
## 6 CONNECTICUT 09000 EDU010202D 570228 EDU0102 2002 New England
## 7 CONNECTICUT 09000 EDU015203D 577203 EDU0152 2003 New England
## 8 CONNECTICUT 09000 EDU015204D 577390 EDU0152 2004 New England
## 9 CONNECTICUT 09000 EDU015205D 575058 EDU0152 2005 New England
## 10 CONNECTICUT 09000 EDU015206D 574909 EDU0152 2006 New England
## # … with 520 more rows
Calling my_wrapper() function twice to process the 2 csv
files and passing the output of wrapper functions as input to
combine_Data() function. combine_Data()
vertically concatenates the state tibbles from both the csv files as
well as the county tibbles.
output_d1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv")
output_d1$county
## # A tibble: 31,450 × 8
## area_name STCOU item_id enrollment_value enrollmen…¹ year count…² state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 EDU0101 1987 Autauga AL
## 2 Autauga, AL 01001 EDU010188D 6900 EDU0101 1988 Autauga AL
## 3 Autauga, AL 01001 EDU010189D 6920 EDU0101 1989 Autauga AL
## 4 Autauga, AL 01001 EDU010190D 6847 EDU0101 1990 Autauga AL
## 5 Autauga, AL 01001 EDU010191D 7008 EDU0101 1991 Autauga AL
## 6 Autauga, AL 01001 EDU010192D 7137 EDU0101 1992 Autauga AL
## 7 Autauga, AL 01001 EDU010193D 7152 EDU0101 1993 Autauga AL
## 8 Autauga, AL 01001 EDU010194D 7381 EDU0101 1994 Autauga AL
## 9 Autauga, AL 01001 EDU010195D 7568 EDU0101 1995 Autauga AL
## 10 Autauga, AL 01001 EDU010196D 7834 EDU0101 1996 Autauga AL
## # … with 31,440 more rows, and abbreviated variable names ¹enrollment_id,
## # ²county_name
output_d1$state
## # A tibble: 530 × 7
## area_name STCOU item_id enrollment_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010187D 445646 EDU0101 1987 New England
## 2 CONNECTICUT 09000 EDU010188D 465279 EDU0101 1988 New England
## 3 CONNECTICUT 09000 EDU010189D 461519 EDU0101 1989 New England
## 4 CONNECTICUT 09000 EDU010190D 461560 EDU0101 1990 New England
## 5 CONNECTICUT 09000 EDU010191D 469013 EDU0101 1991 New England
## 6 CONNECTICUT 09000 EDU010192D 481049 EDU0101 1992 New England
## 7 CONNECTICUT 09000 EDU010193D 488476 EDU0101 1993 New England
## 8 CONNECTICUT 09000 EDU010194D 496303 EDU0101 1994 New England
## 9 CONNECTICUT 09000 EDU010195D 506824 EDU0101 1995 New England
## 10 CONNECTICUT 09000 EDU010196D 517935 EDU0101 1996 New England
## # … with 520 more rows
output_d2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv")
output_d2$county
## # A tibble: 31,450 × 8
## area_name STCOU item_id enrollment_value enrollmen…¹ year count…² state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010197D 8099 EDU0101 1997 Autauga AL
## 2 Autauga, AL 01001 EDU010198D 8211 EDU0101 1998 Autauga AL
## 3 Autauga, AL 01001 EDU010199D 8489 EDU0101 1999 Autauga AL
## 4 Autauga, AL 01001 EDU010200D 8912 EDU0102 2000 Autauga AL
## 5 Autauga, AL 01001 EDU010201D 8626 EDU0102 2001 Autauga AL
## 6 Autauga, AL 01001 EDU010202D 8762 EDU0102 2002 Autauga AL
## 7 Autauga, AL 01001 EDU015203D 9105 EDU0152 2003 Autauga AL
## 8 Autauga, AL 01001 EDU015204D 9200 EDU0152 2004 Autauga AL
## 9 Autauga, AL 01001 EDU015205D 9559 EDU0152 2005 Autauga AL
## 10 Autauga, AL 01001 EDU015206D 9652 EDU0152 2006 Autauga AL
## # … with 31,440 more rows, and abbreviated variable names ¹enrollment_id,
## # ²county_name
output_d2$state
## # A tibble: 530 × 7
## area_name STCOU item_id enrollment_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010197D 525632 EDU0101 1997 New England
## 2 CONNECTICUT 09000 EDU010198D 535164 EDU0101 1998 New England
## 3 CONNECTICUT 09000 EDU010199D 544698 EDU0101 1999 New England
## 4 CONNECTICUT 09000 EDU010200D 554266 EDU0102 2000 New England
## 5 CONNECTICUT 09000 EDU010201D 562179 EDU0102 2001 New England
## 6 CONNECTICUT 09000 EDU010202D 570228 EDU0102 2002 New England
## 7 CONNECTICUT 09000 EDU015203D 577203 EDU0152 2003 New England
## 8 CONNECTICUT 09000 EDU015204D 577390 EDU0152 2004 New England
## 9 CONNECTICUT 09000 EDU015205D 575058 EDU0152 2005 New England
## 10 CONNECTICUT 09000 EDU015206D 574909 EDU0152 2006 New England
## # … with 520 more rows
bind_rows() is used to vertically concatenate state and
county tibbles from each of the csv files. Below given is the function
definition for combine_data()
combine_data <- function(list1,list2){
df_county <- bind_rows(list1$county,list2$county)
df_state <- bind_rows(list1$state,list2$state)
return(list(county=df_county,state=df_state))
}
Below is the function call for combine_data(). We get
two tibbles as output: state and
county
df_combined <- combine_data(output_d1,output_d2)
county <- df_combined$county
state <- df_combined$state
county
## # A tibble: 62,900 × 8
## area_name STCOU item_id enrollment_value enrollmen…¹ year count…² state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 EDU0101 1987 Autauga AL
## 2 Autauga, AL 01001 EDU010188D 6900 EDU0101 1988 Autauga AL
## 3 Autauga, AL 01001 EDU010189D 6920 EDU0101 1989 Autauga AL
## 4 Autauga, AL 01001 EDU010190D 6847 EDU0101 1990 Autauga AL
## 5 Autauga, AL 01001 EDU010191D 7008 EDU0101 1991 Autauga AL
## 6 Autauga, AL 01001 EDU010192D 7137 EDU0101 1992 Autauga AL
## 7 Autauga, AL 01001 EDU010193D 7152 EDU0101 1993 Autauga AL
## 8 Autauga, AL 01001 EDU010194D 7381 EDU0101 1994 Autauga AL
## 9 Autauga, AL 01001 EDU010195D 7568 EDU0101 1995 Autauga AL
## 10 Autauga, AL 01001 EDU010196D 7834 EDU0101 1996 Autauga AL
## # … with 62,890 more rows, and abbreviated variable names ¹enrollment_id,
## # ²county_name
state
## # A tibble: 1,060 × 7
## area_name STCOU item_id enrollment_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010187D 445646 EDU0101 1987 New England
## 2 CONNECTICUT 09000 EDU010188D 465279 EDU0101 1988 New England
## 3 CONNECTICUT 09000 EDU010189D 461519 EDU0101 1989 New England
## 4 CONNECTICUT 09000 EDU010190D 461560 EDU0101 1990 New England
## 5 CONNECTICUT 09000 EDU010191D 469013 EDU0101 1991 New England
## 6 CONNECTICUT 09000 EDU010192D 481049 EDU0101 1992 New England
## 7 CONNECTICUT 09000 EDU010193D 488476 EDU0101 1993 New England
## 8 CONNECTICUT 09000 EDU010194D 496303 EDU0101 1994 New England
## 9 CONNECTICUT 09000 EDU010195D 506824 EDU0101 1995 New England
## 10 CONNECTICUT 09000 EDU010196D 517935 EDU0101 1996 New England
## # … with 1,050 more rows
Plot function for states.
This plots the mean value of the statistic across the years for each
Division. So I have used group_by() on Division and year
and then used summarise() across the mean value. This
function also used filter() to remove the rows for which
the Division is “ERROR”. ggplot2 is used to plot a line
plot. labs() is used to rename the x-axis and y-axis.
plot.state <- function(df, var_name="enrollment_value"){
df_state <- df %>% group_by(Division,year) %>% summarise(mean_enrollment = mean(get(var_name)))
df_state <- df_state %>% filter(Division!="ERROR")
ggplot(df_state, aes(x=year,y=mean_enrollment,color=Division)) +
geom_line() + labs(y="Mean enrollment", x ="Year")
}
Plot function for county.
This function is used to plot county data with a few more options. This
function lets the user decide the state of interest, the highest or the
lowest values and the number of records the user wants to analyse on the
plot.ggplot2 is used to plot a line plot.
labs() is used to rename the x-axis and y-axis.
plot.county <- function(df, var_name="enrollment_value",def_state="NC",group="top",num_find=5){
df_county <- df %>% filter(state==def_state) %>% group_by(area_name) %>% summarise(mean_enrollment = mean(get(var_name)))
#print(df_county)
if(group=="top"){
df_county <- df_county %>% arrange(desc(mean_enrollment))
} else if (group=="bottom"){
df_county <- df_county %>% arrange(mean_enrollment)
}
#print(df_county)
if (group=="top"){
data_subset <- head(df_county,num_find)
} else if(group=="bottom") {
data_subset <- head(df_county,num_find)
}
county_vector <- unique(data_subset$area_name)
#print(county_vector)
filtered_df <- df[df$area_name %in% county_vector,]
ggplot(filtered_df,aes(x=year,y=get(var_name),color=area_name)) +
geom_line() + labs(y="Count", x ="Year")
}
Data processing function is run on the two enrollment URLs given previously, enrollment data column is named as “population”. We get output1 and output2.
output1 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01a.csv",var_name="population")
output2 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/EDU01b.csv",var_name="population")
The state tibbles and county tibbles from the 2 csv files are vertically concatenated together to get output_1_2_county and output_1_2_state
output_1_2_combined <- combine_data(output1,output2)
output_1_2_county <- output_1_2_combined$county
output_1_2_state <- output_1_2_combined$state
output_1_2_combined
## $county
## # A tibble: 62,900 × 8
## area_name STCOU item_id population enrollment_id year county_name state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 EDU010187D 6829 EDU0101 1987 Autauga AL
## 2 Autauga, AL 01001 EDU010188D 6900 EDU0101 1988 Autauga AL
## 3 Autauga, AL 01001 EDU010189D 6920 EDU0101 1989 Autauga AL
## 4 Autauga, AL 01001 EDU010190D 6847 EDU0101 1990 Autauga AL
## 5 Autauga, AL 01001 EDU010191D 7008 EDU0101 1991 Autauga AL
## 6 Autauga, AL 01001 EDU010192D 7137 EDU0101 1992 Autauga AL
## 7 Autauga, AL 01001 EDU010193D 7152 EDU0101 1993 Autauga AL
## 8 Autauga, AL 01001 EDU010194D 7381 EDU0101 1994 Autauga AL
## 9 Autauga, AL 01001 EDU010195D 7568 EDU0101 1995 Autauga AL
## 10 Autauga, AL 01001 EDU010196D 7834 EDU0101 1996 Autauga AL
## # … with 62,890 more rows
##
## $state
## # A tibble: 1,060 × 7
## area_name STCOU item_id population enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 EDU010187D 445646 EDU0101 1987 New England
## 2 CONNECTICUT 09000 EDU010188D 465279 EDU0101 1988 New England
## 3 CONNECTICUT 09000 EDU010189D 461519 EDU0101 1989 New England
## 4 CONNECTICUT 09000 EDU010190D 461560 EDU0101 1990 New England
## 5 CONNECTICUT 09000 EDU010191D 469013 EDU0101 1991 New England
## 6 CONNECTICUT 09000 EDU010192D 481049 EDU0101 1992 New England
## 7 CONNECTICUT 09000 EDU010193D 488476 EDU0101 1993 New England
## 8 CONNECTICUT 09000 EDU010194D 496303 EDU0101 1994 New England
## 9 CONNECTICUT 09000 EDU010195D 506824 EDU0101 1995 New England
## 10 CONNECTICUT 09000 EDU010196D 517935 EDU0101 1996 New England
## # … with 1,050 more rows
Plot function on the state data frame
output_1_2_state. As this has class “state” associated
with it the plot will check that and call plot.state under
the hood.
plot(output_1_2_state,var_name="population")
Plot function on the county data frame
output_1_2_county for different combinations for state
of interest, the top or bottom values and for the number of records to
be plotted. As this has class “county” associated with it the plot will
check that and call plot.county under the hood.
plot(output_1_2_county,var_name="population",def_state="PA",group="top",num_find=7)
plot(output_1_2_county,var_name="population",def_state="PA",group="bottom",num_find=4)
plot(output_1_2_county,var_name="population")
plot(output_1_2_county,var_name="population",def_state="MN",group="top",num_find=10)
Data processing function is run on the four enrollment URLs, enrollment data column is given the default value as no optional argument is passed here.
output3 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01a.csv")
output4 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01b.csv")
output5 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01c.csv")
output6 <- my_wrapper("https://www4.stat.ncsu.edu/~online/datasets/PST01d.csv")
The state tibbles and county tibbles from all the four csv files are vertically concatenated together to get final_output_county and final_output_state
output_3_4 <- combine_data(output3,output4)
output_5_6 <- combine_data(output5,output6)
final_output <- combine_data(output_3_4,output_5_6)
final_output_county <- final_output$county
final_output_state <- final_output$state
final_output
## $county
## # A tibble: 125,800 × 8
## area_name STCOU item_id enrollment_value enrollmen…¹ year count…² state
## <chr> <chr> <chr> <dbl> <chr> <int> <chr> <chr>
## 1 Autauga, AL 01001 PST015171D 25508 PST0151 1971 Autauga AL
## 2 Autauga, AL 01001 PST015172D 27166 PST0151 1972 Autauga AL
## 3 Autauga, AL 01001 PST015173D 28463 PST0151 1973 Autauga AL
## 4 Autauga, AL 01001 PST015174D 29266 PST0151 1974 Autauga AL
## 5 Autauga, AL 01001 PST015175D 29718 PST0151 1975 Autauga AL
## 6 Autauga, AL 01001 PST015176D 29896 PST0151 1976 Autauga AL
## 7 Autauga, AL 01001 PST015177D 30462 PST0151 1977 Autauga AL
## 8 Autauga, AL 01001 PST015178D 30882 PST0151 1978 Autauga AL
## 9 Autauga, AL 01001 PST015179D 32055 PST0151 1979 Autauga AL
## 10 Autauga, AL 01001 PST025181D 31985 PST0251 1981 Autauga AL
## # … with 125,790 more rows, and abbreviated variable names ¹enrollment_id,
## # ²county_name
##
## $state
## # A tibble: 2,120 × 7
## area_name STCOU item_id enrollment_value enrollment_id year Division
## <chr> <chr> <chr> <dbl> <chr> <int> <chr>
## 1 CONNECTICUT 09000 PST015171D 3061448 PST0151 1971 New England
## 2 CONNECTICUT 09000 PST015172D 3069617 PST0151 1972 New England
## 3 CONNECTICUT 09000 PST015173D 3069140 PST0151 1973 New England
## 4 CONNECTICUT 09000 PST015174D 3075784 PST0151 1974 New England
## 5 CONNECTICUT 09000 PST015175D 3084726 PST0151 1975 New England
## 6 CONNECTICUT 09000 PST015176D 3085984 PST0151 1976 New England
## 7 CONNECTICUT 09000 PST015177D 3088729 PST0151 1977 New England
## 8 CONNECTICUT 09000 PST015178D 3094839 PST0151 1978 New England
## 9 CONNECTICUT 09000 PST015179D 3099907 PST0151 1979 New England
## 10 CONNECTICUT 09000 PST025181D 3128849 PST0251 1981 New England
## # … with 2,110 more rows
Plot function on the state data frame
final_output_state. As this has class “state”
associated with it the plot will check that and call
plot.state under the hood.
plot(final_output_state)
Plot function on the county data frame
final_output_county for different combinations for
state of interest, the top or bottom values and for the number of
records to be plotted. As this has class “county” associated with it the
plot will check that and call plot.county under the
hood.
plot(final_output_county,def_state="CT",group="top",num_find=6)
plot(final_output_county,def_state="NC",group="bottom",num_find=10)
plot(final_output_county)
plot(final_output_county,def_state="MN",group="top",num_find=4)