uber.Rmd

---
title: "uber analysis"
author: "Krishnan Raman"
date: "10/6/2020"
output: pdf_document
---

```{r setup, include=TRUE}
knitr::opts_chunk$set(echo = TRUE)
rm(list=ls())
library(dplyr)
library(igraph)
```

```{r}
# CHANGE THIS TO LOCAL DRIVE WHERE uber_nyc_data.csv is located
setwd("~/R/Stat 695/Project/uber/")

# Read at most 40 million rows of the trip file.
trips_raw <- read.csv("uber_nyc_data.csv", nrows = 40 * 1000 * 1000)

# Parse the raw text columns into numbers: distance as a plain double,
# duration ("H:M:S" text) as a number of minutes.
trips_raw$distance <- as.double(as.character(trips_raw$trip_distance))
duration_text <- as.character(trips_raw$trip_duration)
trips_raw$duration <- as.double(
  as.difftime(duration_text, format = "%H:%M:%S", units = "mins")
)

# 1% and 99% quantiles of each measure; rows at or beyond them are dropped.
# This helps with cancelled trips, overly long trips and other outliers.
tail_probs <- c(0.01, 0.99)
durq <- quantile(trips_raw$duration, tail_probs, names = FALSE, na.rm = TRUE)
disq <- quantile(trips_raw$distance, tail_probs, names = FALSE, na.rm = TRUE)

# Build the row filter piece by piece: strictly inside both quantile bands and
# with both zone fields different from the literal string "NULL".
in_duration_band <- trips_raw$duration > durq[1] & trips_raw$duration < durq[2]
in_distance_band <- trips_raw$distance > disq[1] & trips_raw$distance < disq[2]
has_zones <- trips_raw$origin_taz != "NULL" & trips_raw$destination_taz != "NULL"
keep <- in_duration_band & in_distance_band & has_zones

# Columns are picked by position; 7:8 are presumably the derived
# distance/duration columns appended above - TODO confirm against the CSV header.
trips_kept <- select(trips_raw[keep, ], 2:4, 7:8)

# Rows whose filter value was NA come through as all-NA rows; drop them here
# together with any other incomplete row.
final <- trips_kept[complete.cases(trips_kept), ]

# Free the large intermediates and the temporary filter vectors.
rm(trips_raw, trips_kept, duration_text, tail_probs,
   in_duration_band, in_distance_band, has_zones, keep)
```

Compute each trip's fare as a linear combination of its duration and distance, subject to a minimum fare.

```{r}

# Fare schedule constants.
base_fare <- 2.55
per_minute <- 0.35
per_mile <- 1.75
min_fare <- 8

# fare = base + per_minute * duration + per_mile * distance, floored at
# min_fare. pmax() applies the floor element-wise over the whole column, so no
# R-level function is called per row (the table can hold tens of millions of
# trips) and an empty table yields an empty numeric column rather than a list.
final$fare <- pmax(
  min_fare,
  base_fare + per_minute * final$duration + per_mile * final$distance
)

# distance distribution, duration distribution, fare distribution
hist(final$distance)
hist(final$duration)
hist(final$fare)

# get summary stats
summary(final)
```
Log-transform the positive variables (distance, duration and fare).
```{r}
# Distance and duration are shifted by one before the log is taken;
# fare is logged as-is.
final[["logdist"]] <- log(final[["distance"]] + 1.0)
final[["logdur"]] <- log(final[["duration"]] + 1.0)
final[["logfare"]] <- log(final[["fare"]])
```

Gamma priors: fit a gamma density to each log-transformed variable by matching its mean and variance.
```{r}
# Overlay a method-of-moments gamma density on the kernel density of `x`.
#
# The gamma parameters are chosen so that the fitted distribution has the same
# mean and variance as the sample: scale = var(x) / mean(x) and
# shape = mean(x) / scale.
#
# Args:
#   x:     numeric vector of observations.
#   title: main title of the plot.
#   px:    points at which the fitted gamma density is evaluated and drawn;
#          the default is the fixed grid on [0, 5] in steps of 0.05.
#
# Returns (invisibly) a list with the fitted `shape` and `scale`.
bestgammafit <- function(x, title, px = seq(0, 5, 0.05)) {
  dist_scale <- var(x) / mean(x)
  dist_shape <- mean(x) / dist_scale
  py <- dgamma(px, scale = dist_scale, shape = dist_shape)

  kernel <- density(x)
  # Size the y-axis for both curves; sized from the kernel estimate alone, a
  # fitted curve that peaks higher would be cut off at the top of the plot.
  # Non-finite density values are ignored because ylim must be finite.
  y_limits <- range(kernel$y, py[is.finite(py)])
  plot(kernel, main = title, ylim = y_limits)
  lines(px, py, col = "red")

  invisible(list(shape = dist_shape, scale = dist_scale))
}

# One diagnostic plot per log-transformed column, in the order
# distance, duration, fare. invisible() keeps the list of return values
# from being printed.
invisible(Map(
  bestgammafit,
  final[c("logdist", "logdur", "logfare")],
  c("log distance", "log duration", "log fare")
))
```
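To do:

- Try exponential (i.e. Gamma with shape 1) and Pareto fits as well.
- Find an a-b-c-... circuit with the highest fare and the least distance.
- Count the total number of a-b (origin-destination) tuples.
- Find the fare and distance distributions per a-b tuple, or perhaps just the mean distance/duration.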

```{r}
# Split the trips into one data frame per (origin, destination) zone pair.
origdest <- group_split(final %>% group_by(origin_taz, destination_taz))

# get the median fare per src-dest tuple
# vapply() guarantees exactly one unnamed double per pair, also when there are
# no pairs at all.
mid <- vapply(
  origdest,
  function(grp) quantile(grp$fare, probs = 0.5, names = FALSE),
  numeric(1)
)

# sorted$x holds the medians in decreasing order, sorted$ix the positions of
# the corresponding pairs in origdest.
sorted <- sort(unname(mid), decreasing = TRUE, index.return = TRUE)
topk <- 20

# head() returns fewer than topk positions when fewer pairs exist, instead of
# padding with NA indices that would fail in origdest[[i]].
indices <- head(sorted$ix, topk)
origins <- vapply(
  indices,
  function(i) as.character(origdest[[i]]$origin_taz[1]),
  character(1)
)
dest <- vapply(
  indices,
  function(i) as.character(origdest[[i]]$destination_taz[1]),
  character(1)
)

# Zones that occur both as an origin and as a destination among the top pairs.
common <- intersect(origins, dest)

# Flag the top pairs whose two endpoints both lie in that common set, then
# print how many such pairs there are.
x <- dest %in% common & origins %in% common
sum(x)
```
```{r}
# Print the topk highest median fares; head() stops at the available number of
# pairs instead of padding the output with NA.
head(sorted$x, topk)
```

```{r}
# make graph of original dataset
mylen <- length(origdest)

# One row per origin-destination pair: origin zone, destination zone, median
# fare. The matrix is character, so the fare column has to be converted back
# with as.numeric() wherever it is used.
edgematrix <- matrix(NA_character_, nrow = mylen, ncol = 3)
# seq_len() makes the loop a no-op for zero pairs (1:0 would index pair 1 and 0).
for (i in seq_len(mylen)) {
  edgematrix[i, ] <- c(
    # as.character() keeps the zone label even if the column is a factor;
    # c() on a factor would store its integer code instead.
    as.character(origdest[[i]]$origin_taz[1]),
    as.character(origdest[[i]]$destination_taz[1]),
    quantile(origdest[[i]]$fare, probs = 0.5, names = FALSE)
  )
}

# drop = FALSE keeps a two-column matrix even when there is a single pair.
g <- graph_from_edgelist(edgematrix[, 1:2, drop = FALSE], directed = FALSE)
# layout_in_circle is the current igraph name of the deprecated layout.circle.
plot(g, layout = layout_in_circle)
```
```{r}
# Keep only the pairs whose median fare exceeds 60. drop = FALSE keeps the
# result a matrix when exactly one pair qualifies; without it the subset
# collapses to a vector and the column indexing below fails.
highfare_matrix <- edgematrix[as.numeric(edgematrix[, 3]) > 60, , drop = FALSE]

# Rounded median fares, used as edge labels.
highfare_weights <- round(as.numeric(highfare_matrix[, 3]))
g2 <- graph_from_edgelist(highfare_matrix[, 1:2, drop = FALSE], directed = FALSE)

# Same graph under three layouts. The layout_* functions are the current
# igraph names of the deprecated layout.circle, layout.davidson.harel and
# layout.fruchterman.reingold aliases.
plot(g2, edge.label = highfare_weights, layout = layout_in_circle)
plot(g2, edge.label = highfare_weights, layout = layout_with_dh)
plot(g2, edge.label = highfare_weights, layout = layout_with_fr)
```
```{r}
# Lower the median-fare threshold from 60 to 40 in steps of 2 and stop at the
# first threshold whose graph contains a simple path over at least 9 vertices.
for (fares in seq(60, 40, -2)) {
  # drop = FALSE keeps a matrix even when a single pair passes the threshold.
  highfare_matrix <- edgematrix[as.numeric(edgematrix[, 3]) > fares, , drop = FALSE]
  g2 <- graph_from_edgelist(highfare_matrix[, 1:2, drop = FALSE], directed = FALSE)

  # Number of vertices in every simple path, collected over all start
  # vertices. Built in one pass instead of growing a vector inside a loop;
  # vapply() returns integer(0) for a vertex that starts no path.
  lengths <- unlist(lapply(V(g2)$name, function(v) {
    vapply(all_simple_paths(g2, v), length, integer(1))
  }))

  # An empty graph has no paths; report 0 rather than max() of nothing.
  m <- if (length(lengths) > 0) max(lengths) else 0
  cat(sprintf("Fare: %f, Max length: %f\n", fares, m))
  if (m >= 9) break
}
```

```{r}
# visualize the graph with simple path of length 10
# The threshold 44 is hard-coded; presumably it is the value at which the
# threshold search above stopped - TODO confirm.
# drop = FALSE keeps a matrix even when a single pair passes the threshold.
highfare_matrix <- edgematrix[as.numeric(edgematrix[, 3]) > 44, , drop = FALSE]
highfare_weights <- round(as.numeric(highfare_matrix[, 3]))
g3 <- graph_from_edgelist(highfare_matrix[, 1:2, drop = FALSE], directed = FALSE)
# layout_with_fr is the current igraph name of layout.fruchterman.reingold.
plot(g3, layout = layout_with_fr)
```
```{r}
# Search g3 for a simple path over 9 vertices whose last vertex is adjacent to
# its first, i.e. a path that can be closed into a circuit. The first such
# path found is printed and stored, and the search stops.
# (Presumably "length 10" counts the return to the start - TODO confirm.)
paths_of_length_10 <- c()
found <- FALSE
for (src in V(g3)$name) {
  # Every simple path that starts at src.
  res <- all_simple_paths(g3, src)
  # seq_along() skips the loop when src starts no path; 1:length(res) would
  # iterate over c(1, 0) and fail on res[[1]].
  for (i in seq_along(res)) {
    if (length(res[[i]]) == 9) {
      last <- res[[i]][9]$name
      # A distance of 1 from the last vertex back to src means an edge joins
      # them, so the path closes.
      d <- distances(g3, v = last, to = src)
      if (d[1] == 1) {
        print(res[[i]])
        paths_of_length_10 <- c(paths_of_length_10, res[[i]])
        found <- TRUE
        break
      }
    }
  }
  if (found) break
}
```
```{r}
# Zones of the circuit in visiting order. They are hard-coded here;
# presumably copied from the path printed by the search above - TODO confirm.
circuit <- c("9", "10", "16", "15", "7C", "18", "11", "17", "2A")

# Highlight those zones inside the full threshold graph.
plot(g3, mark.groups = circuit, mark.col = "red")

# Draw the circuit on its own. Interleaving every zone with its successor
# (the last one wraps around to the first) gives the flat from/to edge vector.
successor <- c(circuit[-1], circuit[1])
circuit_edges <- as.vector(rbind(circuit, successor))
plot(make_undirected_graph(circuit_edges), edge.color = "red")
```