better visualization of trips through agglomerative clustering
TRANSCRIPT
BetterVisualizationofTripsThroughAgglomerativeClustering
Anbarasan S
February 2, 2016
1.ProblemStatement:
Visualization of flow/mobility data on a map always gives a cluttered view, even for small dataset .Hence it is difficult , to derive any insights or make decisions out of it.
2.Solution:
Devise a methodology to group or aggregate similar flows
3.Methodology:
Step 1: K-Nearest Neighbours
1.a. Find the k-nearest neighbors for the Origin location of particular Trip/Flow.
1.b.Find The k-nearest Neighbour For the destination location of particular Trip/Flow.
Step 2: Contiguous Flows
Two flows/Trips ,are said to be contiguous if and only if, it satisfies both the following conditions
1. K-NN of Origin1(Trip1) overlaps with k-NN of Origin2(Trip2).2. K-NN of Destination1(Trip1) overlaps with k-NN of Destination2(Trip2).
Step 3: Agglomerative Clustering
Two flows are clustered in a agglomerative fashion, based on a distance measure defined by the number of nearest neighbours shared
Dist(Trip1,Trip2) = 1- [KNN(O1) η KNN(O2)/k * KNN(D1) η KNN(D2)/k]
O1,O2- Origins Of Trip1 and Trip2 respectively
D1,D2 � destination of Trip1 and Trip2 respectively
Very low value of dist , suggests that the flows are very nearer, and larger value >=1 suggests that flows cannot be clustered together.
Step 4: Visualization
Agglomerative Clusters , when projected on to map, gives meaningful insights
4. Implementation:
Dataset: Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015 Data Set
https://archive.ics.uci.edu/ml/datasets/Taxi+Service+Trajectory+-+Prediction+Challenge,+ECML+PKDD+2015
tt_trajectory <- read.csv("G:/TaxiTrajectory/train.csv",nrows = 500, stringsAsFactors = FALSE)
head(tt_trajectory,1)
## TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP## 1 1.372637e+18 C NA NA 20000589 137263 6858## DAY_TYPE MISSING_DATA## 1 A False## POLYLINE## 1 [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-8.630829,41.154498],[-8.630838,41.154489]]
4.1.GettheOriginAndDestinationlocationforeachtrip
∑ The Polyline field contains the Longitude � Latitude location details of a trip taken every 15 secs.
∑ The polyline field is parsed using JSON library , to get the Origin and destination locations of each trip
library(RJSONIO)
## Warning: package 'RJSONIO' was built under R version 3.2.3
# Function to get Origin and Destination Location of each trip# using json librarypositions_1 <- function(row){
trajectory_list <- fromJSON(row$POLYLINE)no_of_points <- length(trajectory_list)if(no_of_points>0){lan_lat_pairs<-data.frame(rbind(trajectory_list[[1]],
trajectory_list[[no_of_points]]))return(lan_lat_pairs)}}
coordinates <- data.frame(TripId=c(), Ordinal=c(),Booking_time= c(), Lat=c(), Lon=c(), Status=c())
for (i in 1:nrow(tt_trajectory)) {
lat_lon_df <- positions_1(tt_trajectory[i,])
status <- c("Origin","Destination")
if(!is.null(lat_lon_df)){coordinates <- rbind(coordinates,
data.frame(TripId = tt_trajectory$TRIP_ID[i], Booking_time = tt_trajectory$TIMESTAMP[i],
Lat = lat_lon_df$X2,Lon = lat_lon_df$X1,Status = status ))
}}
coordinates$Status <- factor(coordinates$Status, levels <-c("Origin","Destination"))head(coordinates)
## TripId Booking_time Lat Lon Status## 1 1.372637e+18 1372636858 41.14141 -8.618643 Origin## 2 1.372637e+18 1372636858 41.15449 -8.630838 Destination## 3 1.372637e+18 1372637303 41.15983 -8.639847 Origin## 4 1.372637e+18 1372637303 41.17067 -8.665740 Destination## 5 1.372637e+18 1372636951 41.14036 -8.612964 Origin## 6 1.372637e+18 1372636951 41.14053 -8.615970 Destination
4.2VisualizetheTrips
Visualizethetripsontoamaplibrary(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
lat.centre = median(coordinates$Lat)lon.centre = median(coordinates$Lon)
map.portugal <-ggmap(get_map(location = c(lon.centre,lat.centre),maptype = "roadmap", source="google",zoom = 12))
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=41.157607, -8.615227&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en -EN&sensor=false
map.portugal+geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)
## Warning: Removed 5 rows containing missing values (geom_point).
# Connect SOurce and Destination for Each trip
map.portugal+geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)+geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color =
"blue")
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missin g values (geom_path).
map.portugal+geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ),
color = "blue")
## Warning: Removed 5 rows containing missing values (geom_path).
Projection of 1/100 of the original dataset, gives a cluttered view on the map.
4.3.FindK-NNfororiginpointsanddestinationpointsseparately
4.3.1. Knn for Origin
# Find distance using Haversiandistance
library(dplyr)
library(geosphere)
library(dbscan)
# Function to Convert Ids to TripIdsIds_to_tripIds<- function(row) lat_lon_origin[row,"TripId"]
# Origin of all the tripslat_lon_origin <- filter(coordinates,Status == "Origin") %>%
select(TripId,Lat,Lon) %>%as.matrix()
# Find the actual distance in meters between the origin pointsdistance_matrix_origin <- distm(lat_lon_origin[,c(3,2)],
fun=distHaversine)
# Find kNN For origin# No. of. nearest neighbours = 50knn_origin <- kNN(distance_matrix_origin,k=50)
Origin_NN_ids <- apply(knn_origin$id,1,Ids_to_tripIds)
# Construct nearest neighbour for every origin pointorigin_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_origin)) {origin_NN <- rbind(origin_NN,
data.frame(FlowId = lat_lon_origin[i,"TripId"],Neighbour_Flows =
Origin_NN_ids[,i]))}
head(origin_NN)
## FlowId Neighbour_Flows## 1 1.372637e+18 1.372657e+18## 2 1.372637e+18 1.372662e+18## 3 1.372637e+18 1.372638e+18## 4 1.372637e+18 1.372663e+18## 5 1.372637e+18 1.372656e+18## 6 1.372637e+18 1.372642e+18
4.3.2.Knn for destination
# Destinationlat_lon_dest <- filter(coordinates,Status == "Destination")%>%
select(TripId,Lat,Lon) %>% as.matrix()distance_matrix_dest <- distm(lat_lon_dest[,c(3,2)], fun=distHaversine)
# Find kNN For destinationknn_destination <- kNN(distance_matrix_dest,k=50)dest_NN_ids <- apply(knn_destination$id,1,Ids_to_tripIds)dest_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_dest)) {dest_NN <- rbind(dest_NN,
data.frame(FlowId = lat_lon_dest[i,"TripId"],
Neighbour_Flows = dest_NN_ids[,i]))}
4.4FindContiguousFlowPairflow_pair <- data.frame(flow = c(), Neighbour_flow = c(),
snn_distance = c() )
flow_distance <- function(row,o_NN,d_NN){onn <-
origin_NN[origin_NN$FlowId==row,]$Neighbour_Flowsdnn <-
dest_NN[dest_NN$FlowId==row,]$Neighbour_Flowscommon_o_points <- sum(o_NN %in% onn)/50common_d_points <- sum(d_NN %in% dnn)/50snn <- 1-(common_o_points*common_d_points)return(snn)}
for(i in tt_trajectory$TRIP_ID){
o_NN <- filter(origin_NN,FlowId==i)%>% select(Neighbour_Flows) #nearest neighbours of given origin
d_NN <- filter(dest_NN,FlowId == i )%>%select(Neighbour_Flows) #NNs of given destination
NN_matches <- o_NN[,1] %in% d_NN[,1] #Flows having Common Origin and Common destination
# Contiguous/Nearest Flows for a given flow# Two Flows are said to be Contiguous if they are in# Nearest neighbour of both origin and destination of a given flowcontiguous_flows <-o_NN$Neighbour_Flows[NN_matches]
#dist bw Flows#Arguments Passed:##1.List of Flows Found to be Contiguous to a given flow##2.flow_distance -Function to calculate distance between 2 flowsif(length(contiguous_flows)!=0){ snn_flow_distance <-
sapply(contiguous_flows,flow_distance,o_NN[,1],d_NN[,1])
flow_pair <- rbind(flow_pair,data.frame(flow = i,
Neighbour_flow = contiguous_flows,snn_distance = snn_flow_distance ))
}# print(length(contiguous_flows)) # print(length(snn_flow_distance))
}flow_pair <- flow_pair[flow_pair$snn_distance!=1,]flow_pair <-
flow_pair[order(flow_pair$flow,flow_pair$snn_distance),]
4.5AgglomerativeClustering
## Initialize Clusters
for(i in 1:nrow(tt_trajectory)){ tt_trajectory$Cluster[i] = paste("c",i,sep = "")
## Group Clusters
for(i in 1:nrow(flow_pair)){
flow_1 <- flow_pair$flow[i]
flow_2 <- flow_pair$Neighbour_flow[i]
#Determine the Cluster,the Flows associated with
cluster_label_flow_1 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_1,]$Cluster
cluster_label_flow_2 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_2,]$Cluster
#Find Distance Between Clusters
#Step 1:Get All points in a single cluster
flows_with_cluster_1 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_1,]$TRIP_ID
flows_with_cluster_2 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$TRIP_ID
#Step 2:Compute Distance Matrix
distance_bw_clusters = data.frame(distances = c())
for(j in flows_with_cluster_1){
for(k in flows_with_cluster_2 ){
rec_in_flowpair <- subset(flow_pair,flow == j &
Neighbour_flow == k)
if(nrow(rec_in_flowpair)!=0){
distance_bw_clusters = rbind(distance_bw_clusters,
data.frame(distances = rec_in_flowpair$snn_distance))
}
}
}
#Step 3:Condition for grouping Clusters Together -Linkage Measures
avg_dist_bw_clusters <- mean(distance_bw_clusters$distances)
if(!is.na(avg_dist_bw_clusters) & avg_dist_bw_clusters < 1){
tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$Cluster <-cluster_label_flow_1
}
}
4.6Visuaizetheclusters
cluster_table <- group_by(tt_trajectory,Cluster) %>%
count(Cluster) %>%
filter(n>1)%>% select(Cluster,n)
Cluster_centroids <- data.frame(ClusterId = c(),Lat = c(),Lon = c(),no_of_flows = c())
for(i in cluster_table$Cluster){
tripids_of_cluster <-tt_trajectory[tt_trajectory$Cluster==i,]$TRIP_ID
lat_lon_o_df <- data.frame(lat_lon_origin)
cluster_o_positions <- filter(lat_lon_o_df,
lat_lon_o_df$TripId %in% tripids_of_cluster)%>%select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId=i,
Lat = apply(cluster_o_positions,2,median)[1],
Lon = apply(cluster_o_positions,2,median)[2],
no_of_flows = cluster_table[cluster_table$Cluster==i,]$n))
lat_lon_d_df <- data.frame(lat_lon_dest)
cluster_d_positions <- filter(lat_lon_d_df,
lat_lon_d_df$TripId %in% tripids_of_cluster) %>% select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId = i,
Lat = apply(cluster_d_positions,2,median)[1],
Lon = apply(cluster_d_positions,2,median)[2],
no_of_flows = cluster_table[cluster_table$Cluster==i,]$n))
}
map.portugal+
geom_line(data = Cluster_centroids,
aes(x=Lon,y=Lat,group =ClusterId, size = no_of_flows, alpha = 1.5),color = "red")
CONCLUSION
FlowClusteringoftrips,thusprovidesameaningfulinsightsabouttheflow,themorebroadertheclusteredflow,ithasmorenumberofactualflows.Onincreasingthenumberofnearestneighbours,theflowclustersgivesaclearpictureoftheflows.Duetoextensivecomputationrequiredforsuchanalysis,Irestrictedtoreducednumberoftrips(500)andreducednumberofnearestneighbours(50)