Transport Analytics Training Series - Last Revision: October 2022

Studying the London Underground¶

Now that we have experimented with a few small networks, we are ready to look at a more substantial dataset - the London Underground!

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

Part 1 - Drawing the London Underground network¶

We have placed a simplified dataset of the LU network structure under the data-london-underground folder, which we will load using pandas.

In [2]:
stations = pd.read_csv('data-london-underground/lu_stations.csv')
stations
Out[2]:
id latitude longitude name zone
0 1 51.5028 -0.2801 Acton Town 3.0
1 8 51.5653 -0.1353 Archway 2.5
2 9 51.6164 -0.1331 Arnos Grove 4.0
3 10 51.5586 -0.1059 Arsenal 2.0
4 11 51.5226 -0.1571 Baker Street 1.0
... ... ... ... ... ...
139 296 51.5120 -0.2239 White City 2.0
140 297 51.5492 -0.2215 Willesden Green 2.5
141 303 51.5975 -0.1097 Wood Green 3.0
142 301 51.6070 0.0341 Woodford 4.0
143 302 51.6179 -0.1856 Woodside Park 4.0

144 rows × 5 columns

This is roughly half the number of stations found in the real-world network - we trimmed the dataset in order to keep things simple for the purposes of this analysis.

In [3]:
links = pd.read_csv('data-london-underground/lu_links.csv')
links
Out[3]:
station1 station2 line time
0 1 234 10 4
1 1 265 10 4
2 8 124 9 3
3 8 264 9 2
4 9 31 10 3
... ... ... ... ...
164 257 258 9 2
165 261 302 9 3
166 266 303 10 2
167 279 285 7 2
168 288 302 9 1

169 rows × 4 columns

No surprises here - let's now convert these into a graph.

In [4]:
G = nx.Graph()
G.add_nodes_from(stations['id'])
G.add_edges_from(list(zip(links['station1'], links['station2'])))

nx.draw(G)

With such a large number of nodes, this ends up being a very busy graph. We can amend the way that the nodes are plotted so that it looks a bit nicer, using the node_size parameter.

In [5]:
nx.draw(G, node_size = 6)

But it remains a bit difficult to see - what if we could make it a bit bigger?

This is possible using a few more advanced matplotlib features. You see, in every new cell we create a new instance of a matplotlib chart. Thanks to the pyplot module within matplotlib, chart creation is quite similar to the way it is done in Matlab - so some concepts might look familiar.

To modify the size of the figure, we simply have to initialise the chart ourselves, using the plt.figure() command, and then specify its size using the figsize argument.

If you want more help with the transition from Matlab to Python, you can read this very helpful guide, or follow this DataCamp course.

In [6]:
plt.figure(figsize=(16,10))
nx.draw(G, node_size = 40)

Much better, but now that we have a better look at it, this certainly doesn't look anything like the London Tube.

Ah! But of course! We forgot to add the coordinates.

In [7]:
plt.figure(figsize=(16,10))

coords = list(zip(stations['longitude'],stations['latitude']))
pos = dict(zip(stations['id'], coords))
nx.draw(G,pos,node_size = 40)
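If you also want to see which station each node represents, nx.draw accepts a labels dictionary mapping node IDs to display names, built exactly like the pos dictionary above. A minimal sketch using a handful of made-up stations (the IDs, names, and coordinates below are illustrative only, not taken from the dataset):

```python
import networkx as nx
import matplotlib.pyplot as plt

# Toy stand-in for the stations/links data (illustrative values only)
ids = [1, 2, 3]
names = ['A', 'B', 'C']
lons = [-0.28, -0.14, -0.13]
lats = [51.50, 51.56, 51.62]

G_demo = nx.Graph()
G_demo.add_nodes_from(ids)
G_demo.add_edges_from([(1, 2), (2, 3)])

pos = dict(zip(ids, zip(lons, lats)))   # node id -> (x, y)
labels = dict(zip(ids, names))          # node id -> display name

plt.figure(figsize=(6, 4))
nx.draw(G_demo, pos, node_size=40, labels=labels, font_size=8)
```

Passing labels switches the text rendering on automatically, so there is no need to set with_labels=True separately.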

Part 2 - Extraction of network subgraphs¶

What if we wanted to only illustrate the subgraph of the network that lies within Zone 1?

We can do that easily using the zone column in the stations dataframe - note that the authors of that list chose to use "half" values to denote stations that lie in two zones at the same time. Therefore, Archway station is described as being in zone 2.5, when in official maps it is placed on the boundary of zones 2 and 3.

Therefore, if we want to obtain all the nodes that are found in Zone 1, we really have to obtain the stations with a zone value of <2 - if we used <=1 to filter the list, we would have excluded stations that lie on the zone boundary, such as Earl's Court.

In [8]:
stations_z1 = pd.read_csv('data-london-underground/lu_stations.csv')
stations_z1 = stations_z1[stations_z1['zone']<2]
len(stations_z1)
Out[8]:
36

We filtered the stations using a condition applied on the zone column. This effectively says:

"Look at the zone column within the stations_z1 dataframe, and select the rows where its value is less than 2. Now return a new dataframe, that contains only these rows".

We can now proceed to filter the links. The links dataframe does not contain any information on zones, but we can filter it by checking whether both stations in each edge are found within our filtered list of Zone 1 stations.

To do this, we first create a list of all "allowed" node IDs. We then filter the list by excluding any link whose endpoints do not both belong to Zone 1.

In [9]:
allowed_stations = list(stations_z1['id'])

links_z1 = pd.read_csv('data-london-underground/lu_links.csv')
links_z1 = links_z1.loc[links_z1['station1'].isin(allowed_stations)]
len(links_z1)
Out[9]:
57

We have now narrowed the list down to 57 links, which have an allowed station in the station1 column. Let's now apply the same filter to station2.

In [10]:
links_z1 = links_z1.loc[links_z1['station2'].isin(allowed_stations)]
len(links_z1)
Out[10]:
54
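The two filtering steps above could also be combined into a single expression using the & operator, which performs an element-wise "and" on the two boolean masks. A sketch on toy data (the IDs below are made up for illustration):

```python
import pandas as pd

allowed = [10, 11, 20]  # illustrative "allowed" station IDs

links_demo = pd.DataFrame({'station1': [10, 10, 99],
                           'station2': [11, 99, 20]})

# Require *both* endpoints to be in the allowed set, in one step
both_in = (links_demo['station1'].isin(allowed)
           & links_demo['station2'].isin(allowed))
links_filtered = links_demo[both_in]
```

Note the parentheses around each isin() call - & binds more tightly than comparison operators in Python, so they are needed when the conditions are written inline.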

Let's now visualise this part of the network:

In [11]:
G_z1 = nx.Graph()
G_z1.add_nodes_from(stations_z1['id'])
G_z1.add_edges_from(list(zip(links_z1['station1'], links_z1['station2'])))

plt.figure(figsize=(16,10))
coords = list(zip(stations_z1['longitude'],stations_z1['latitude']))
pos = dict(zip(stations_z1['id'], coords))
nx.draw(G_z1, pos, node_size = 60)
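As an aside, NetworkX can extract a subgraph directly from a list of node IDs with Graph.subgraph(), which keeps only the edges whose endpoints are both in the given set - so we could have skipped filtering the links dataframe by hand. A minimal sketch on a toy graph:

```python
import networkx as nx

G_demo = nx.Graph()
G_demo.add_edges_from([(1, 2), (2, 3), (3, 4)])

# subgraph() keeps the given nodes plus only the edges
# whose endpoints are *both* in that set
H = G_demo.subgraph([1, 2, 3])
```

Filtering the dataframes by hand, as we did above, has the advantage of keeping the link attributes (line, time) available for later steps.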

Part 3 - Obtaining centrality metrics¶

We can now compute a set of centrality metrics.

I am going to use a lambda function to add station names into a column, based on a dictionary and the value of the ID column. There are much easier ways to achieve this, but I wanted to take this opportunity to show the lambda feature in action.
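In miniature, the pattern looks like this (the dictionary entries below are illustrative): map() calls the lambda once per element, and the lambda looks each ID up in the dictionary.

```python
import pandas as pd

dict_demo = {1: 'Acton Town', 8: 'Archway'}
s = pd.Series([1, 8, 1])

# map() applies the lambda element-wise; each ID is
# looked up in the dictionary
named = s.map(lambda x: dict_demo[x])
```

One of the easier ways alluded to above: Series.map also accepts a dictionary directly, so s.map(dict_demo) produces the same result without a lambda.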

In [12]:
dict_names = dict(zip(stations['id'],stations['name']))
In [13]:
centralities = pd.DataFrame()
centralities['ID'] = G.nodes()
centralities['Names'] = centralities["ID"].map(lambda x:dict_names[x])
centralities['degree_centr'] = nx.degree_centrality(G).values()
centralities['closeness_centr'] = nx.closeness_centrality(G).values()
centralities['betweenness_centr'] = nx.betweenness_centrality(G).values()
centralities['eigenvector_centr'] = nx.eigenvector_centrality(G).values()
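A caution on the .values() calls above: they rely on the centrality dictionaries iterating in the same order as G.nodes(). A slightly more defensive pattern wraps each result in pd.Series, which aligns values to the dataframe index by node ID regardless of order. A sketch on a toy graph (node IDs are illustrative):

```python
import networkx as nx
import pandas as pd

G_demo = nx.path_graph([10, 20, 30])  # toy path graph with explicit node IDs

df = pd.DataFrame({'ID': list(G_demo.nodes())}).set_index('ID')
# pd.Series(dict) aligns values to the index by node ID,
# so the row order of the dataframe no longer matters
df['degree_centr'] = pd.Series(nx.degree_centrality(G_demo))
```

Here the middle node of the path has degree 2 out of a possible 2, so its degree centrality is 1.0, while the endpoints get 0.5.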

Let us now obtain our "Top 10" lists.

In [14]:
centralities.sort_values(by='degree_centr', ascending=False).head(10).reset_index()[['Names','degree_centr']]
Out[14]:
Names degree_centr
0 Green Park 0.041958
1 Oxford Circus 0.034965
2 Waterloo 0.034965
3 Leicester Square 0.027972
4 Bond Street 0.027972
5 Euston 0.027972
6 Finsbury Park 0.027972
7 Piccadilly Circus 0.027972
8 Stockwell 0.027972
9 Tottenham Court Road 0.027972
In [15]:
centralities.sort_values(by='closeness_centr', ascending=False).head(10).reset_index()[['Names','closeness_centr']]
Out[15]:
Names closeness_centr