####################
# Module 7 - Lab
# 1/8/2014
####################

## Part A

# Bike Lanes Dataset: BikeBaltimore is the Department of Transportation's bike program.
# https://data.baltimorecity.gov/Transportation/Bike-Lanes/xzfj-gyms
# Download as a CSV (like the Monuments dataset) in your current working directory

# 1. Using tapply():
# 	(a) Which project category has the longest average bike lane?
tab=tapply(bike$length,bike$project, mean,na.rm=TRUE)
tab[which.max(tab)]

#	(b) What was the average bike lane length per year that they were installed?
bike$dateInstalled[bike$dateInstalled==0] = NA
tapply(bike$length,bike$dateInstalled,mean,na.rm=TRUE)

# 2. (a) Numerically [hint: `quantile()`] and (b) graphically [hint: `hist()` or `plot(density())`]
#	describe the distribution of bike "lane" lengths.
quantile(bike$length)
mean(bike$length)
sd(bike$length)

hist(bike$length)
hist(bike$length,breaks=100)

# 3. Then describe as above, after stratifying by i) type then ii) number of lanes
boxplot(bike$length~bike$type)
boxplot(log2(bike$length+1)~bike$type)

levels(factor(bike$type)) # this is the order of boxes
boxplot(bike$length~bike$numLanes)

tapply(bike$length,bike$type, quantile,na.rm=TRUE)
tapply(bike$length,bike$numLanes, quantile,na.rm=TRUE)

tapply(bike$length,bike$type, quantile, prob=.1)


## Part B

# Download the CSV: http://biostat.jhsph.edu/~ajaffe/files/indicatordeadkids35.csv
# Via: http://www.gapminder.org/data/
# Definition of indicator: How many children the average couple had that die before the age 35.

death = read.csv("http://biostat.jhsph.edu/~ajaffe/files/indicatordeadkids35.csv",
                  as.is=TRUE,row.names=1)
death2 = read.csv("http://biostat.jhsph.edu/~ajaffe/files/indicatordeadkids35.csv",
                 as.is=TRUE)
rownames(death2) = death2$X
death2$X=NULL
rownames(death2)

# 5. How many countries have data in any year?
dim(death)
!is.na(death)[1:5,1:5]
table(rowSums(!is.na(death)))

# 6. When did measurements in the US start?
death["United States",]
# death[death$X=="United States",]
!is.na(death["United States",])
allIndex=  which(!is.na(death["United States",]))
allIndex[1]
i= which(!is.na(death["United States",]))[1]
colnames(death)[i]

## one line version
colnames(death)[which(!is.na(death["United States",]))[1]]

# 7. How many countries, and which, had data the first year of measuring?
!is.na(death[,1])
which(!is.na(death[,1]))
rownames(death)[which(!is.na(death[,1]))]

# 4. Plot the distribution of average country's count across all year.
rowMeans(death,na.rm=TRUE)
hist(rowMeans(death,na.rm=TRUE))

# 5.(a) How many entries are less than 1?
death < 1
sum(death < 1,na.rm=TRUE)
mean(death < 1,na.rm=TRUE)

#	(b) Which array indices do they correspond to? [hint: `arr.ind` argument in `which()`]
head(which(death<1,arr.ind=FALSE))
head(which(death<1,arr.ind=TRUE))
ind =which(death<1,arr.ind=TRUE) 

# 6. Plot the count for each country across year in a line plot [hint: `matplot()`]
matplot(death,type="l")