library(readr)
# Read the survey CSV from disk; readr guesses each column's type.
Dataset <- read_csv(
  file = "C:/Users/coblem/Documents/Home/Matt/Graduate School/LIS-661/Dataset/Dataset.csv"
)
## Parsed with column specification: ## cols( ## .default = col_double(), ## HRHHID = col_character(), ## Skip2 = col_character(), ## GESTFIPS = col_character(), ## Skip3 = col_character(), ## Skip7 = col_character(), ## Skip8 = col_character(), ## Column29 = col_character() ## )
## See spec(...) for full column specifications.
Now convert this to a plain data frame:
# Copy the imported table into a base data.frame (check.names = TRUE is the
# data.frame() default, written out here for clarity).
df_Dataset <- data.frame(Dataset, check.names = TRUE)
Let’s take a look at the number of unique values in each column:
# Count the distinct values in every column. vapply() pins the result to one
# integer per column, so the return type cannot silently change the way
# sapply()'s does on empty or irregular input.
vapply(df_Dataset, function(column) length(unique(column)), integer(1))
## HRHHID HRMONTH HRYEAR4 Skip1 HUSPNISH Skip2 GESTFIPS Skip3 ## 62224 1 1 109 2 63203 51 8776 ## PRTAGE Skip4 PEMARITL Skip5 PESEX PEAFEVER Skip6 PEEDUCA ## 83 2 7 15 3 3 3 17 ## PTDTRACE Skip7 PENATVTY PEMNTVTY PEFNTVTY PRCITSHP Skip8 PES1 ## 27 2130 161 161 160 6 152096 6 ## PES2 Skip9 PES4 PES5 Column29 ## 6 13 15 6 406
So the csv headers imported and were assigned to the columns as field names. There are 9 columns I just don’t care about at the moment, so they are identified as Skip1, Skip2, …
What is the mean age (PRTAGE)?
# Arithmetic mean of the PRTAGE column ([[ ]] extracts by exact name).
mean(df_Dataset[["PRTAGE"]])
## [1] 33.8675
Let’s look at a histogram of the age (PRTAGE)
# Plot from df_Dataset, the same object every other statement uses, and give
# the plot a readable title and axis label instead of the deparsed expression.
hist(
  df_Dataset$PRTAGE,
  main = "Histogram of age (PRTAGE)",
  xlab = "Age (PRTAGE)"
)
I’m pretty sure that PRTAGE is numeric, but I just want to check:
# Print the compact structure (column names, types, first values).
utils::str(df_Dataset)
## 'data.frame': 152096 obs. of 29 variables: ## $ HRHHID : chr "710004140617571" "710004140617571" "710004140617571" "000110157414767" ... ## $ HRMONTH : num 11 11 11 11 11 11 11 11 11 11 ... ## $ HRYEAR4 : num 2016 2016 2016 2016 2016 ... ## $ Skip1 : num 1201 1201 1201 1201 1201 ... ## $ HUSPNISH: num 0 0 0 0 0 0 0 0 0 0 ... ## $ Skip2 : chr "1 1 1-1 115-1-1-1 18258606 1 3 3 6 2 0 204011 1 2-1-1-1-1 36" "1 1 1-1 115-1-1-1 18258606 1 3 3 6 2 0 204011 1 2-1-1-1-1 36" "1 1 1-1 115-1-1-1 18258606 1 3 3 6 2 0 204011 1 2-1-1-1-1 36" "1 1 1-1 115-1-1-1 17848407 1 4 1 6 2 0 204011 1 2-1-1-1-1 36" ... ## $ GESTFIPS: chr "01" "01" "01" "01" ... ## $ Skip3 : chr "338600002103000 1-1" "338600002103000 4 1" "338600002103000 5 2" "338600002103000 1-1" ... ## $ PRTAGE : num 63 31 5 45 52 19 14 45 48 17 ... ## $ Skip4 : num 0 0 0 0 0 0 0 0 0 0 ... ## $ PEMARITL: num 3 4 -1 1 1 6 -1 1 1 6 ... ## $ Skip5 : num -1 -1 -1 2 1 -1 -1 2 1 -1 ... ## $ PESEX : num 1 1 1 2 1 2 2 2 1 1 ... ## $ PEAFEVER: num 1 1 -1 2 2 2 -1 2 2 2 ... ## $ Skip6 : num 2 2 -1 2 2 2 -1 2 2 2 ... ## $ PEEDUCA : num 44 39 -1 43 44 39 -1 44 39 36 ... ## $ PTDTRACE: num 1 1 1 1 1 1 1 1 1 1 ... ## $ Skip7 : chr "-1 9 1 1 1 1 2 4 2" "-1 9 2 2 1 3 2 5 2" "-1 9 3 2 3 3 2-1 1" "-1 9 1 1 1 1 2 1 2" ... ## $ PENATVTY: num 57 57 57 57 57 57 57 57 57 57 ... ## $ PEMNTVTY: num 57 57 57 57 57 57 57 57 57 57 ... ## $ PEFNTVTY: num 57 57 57 57 57 57 57 57 57 57 ... ## $ PRCITSHP: num 1 1 1 1 1 1 1 1 1 1 ... ## $ Skip8 : chr "0 0 1 5 3-1-1 2-1-1-1 1-1 2-1-1-1-1-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1 -1-1 -1-1-1-1-1-1-1-1-1-1-1-1-1 -1"| __truncated__ "0 0 2 1 1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 2-140-1-1 40-1-1-1-1 2-1 2-140-1 40-1 2 5 5-1 2 3 5 2-1-1-1-1-1 -1"| __truncated__ "0 0-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1 -1-1 -1-1-1-1-1-1-1-1-1-1-1-1-1 -1"| __truncated__ "0 0 1 7 2-1-1 2-1 2-1-1-1-1-1-1 2 2-1-1-1-1-1-1-1 -1-1-1-1-1-1-1-1-1-1-1 -1-1 -1-1-1-1-1-1-1-1-1-1-1-1-1 -1"| __truncated__ ... 
## $ PES1 : num 1 1 -1 1 1 2 -1 1 1 -1 ... ## $ PES2 : num -1 -1 -1 -1 -1 2 -1 -1 -1 -1 ... ## $ Skip9 : num -1 -1 -1 -1 -1 -3 -1 -1 -1 -1 ... ## $ PES4 : num -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ... ## $ PES5 : num 1 1 -1 1 1 -1 -1 1 1 -1 ... ## $ Column29: chr "1 6 4 1" "1 1 4 2" "-1-1-1-1" "1 3 2 1" ...
Yep, that’s numeric. Now what’s the mean age?
# Arithmetic mean of the PRTAGE column ([[ ]] extracts by exact name).
mean(df_Dataset[["PRTAGE"]])
## [1] 33.8675
And the median age?
# Median of the PRTAGE column ([[ ]] extracts by exact name).
median(df_Dataset[["PRTAGE"]])
## [1] 33
Need to create a function to calculate Mode…
# Return the most frequent value(s) of `x` (the statistical mode).
#
# Arguments:
#   x: an atomic vector. NA entries are ignored because table() drops them
#      by default.
#
# Returns:
#   - NA when `x` has no non-missing values, or when two or more distinct
#     values all occur equally often (no value stands out);
#   - otherwise every value tied for the highest count: numeric when `x` is
#     numeric, character otherwise.
Mode <- function(x) {
  counts <- table(x)

  # Nothing to tabulate; returning early avoids max() warning on empty input.
  if (length(counts) == 0L) {
    return(NA)
  }

  top <- max(counts)

  # A flat distribution over several values has no mode. A single distinct
  # value is trivially the mode, so the flat check needs more than one level.
  if (length(counts) > 1L && all(counts == top)) {
    return(NA)
  }

  # table() stores the values as names, so convert back for numeric input.
  winners <- names(counts)[counts == top]
  if (is.numeric(x)) {
    winners <- as.numeric(winners)
  }
  winners
}
# Most frequent value(s) of the PRTAGE column.
Mode(df_Dataset[["PRTAGE"]])
## [1] -1
And yes, -1 is a legal value indicating that the respondent chose not to say. I’ll have to figure out how to filter those out.