In [1]:
# This is partially equivalent to %pylab inline, except expanded

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
In [2]:
import zipfile
In [3]:
# Open the zip archive and list the names of the files stored inside it.
# `zipped` is not closed here; later cells call zipped.open() on it.
zipped = zipfile.ZipFile('data/taxirides.csv.zip')
file_list = zipped.namelist()
print(file_list)
['firstday.csv']

Note

  • Explain how files work
  • Explain with statement
  • Show "unwrapped" with statement
In [4]:
# Name of the first entry in the archive listing.
file_in_archive = file_list[0]

# The with-block closes the member's file handle when the block exits.
with zipped.open(file_in_archive, 'r') as f:
    # Peek at the first two raw lines; they come back (and print) as bytes.
    line = f.readline()
    print(line)
    line2 = f.readline()
    print(line2)
b'passenger_count,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude\n'
b'4,382,1.0,-73.978165,40.757977000000004,-73.989838,40.751171\n'
In [5]:
# Parse the comma-separated file into a NumPy array.
# skip_header=1 drops the first line, which holds column names, not numbers.
with zipped.open(file_list[0]) as f:
    data = np.genfromtxt(f, skip_header=1, delimiter=',')
In [6]:
data.shape
Out[6]:
(412630, 7)
In [7]:
data[0]
Out[7]:
array([   4.      ,  382.      ,    1.      ,  -73.978165,   40.757977,
        -73.989838,   40.751171])
In [8]:
import pandas

# Re-read the same archive member with pandas. This rebinds `data` from the
# NumPy array loaded earlier to a DataFrame.
with zipped.open(file_list[0]) as f:
    data = pandas.read_csv(f)
In [9]:
data
Out[9]:
passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
0 4 382 1.00 -73.978165 40.757977 -73.989838 40.751171
1 1 131 0.50 -73.992172 40.749954 -73.996750 40.744553
2 2 1537 10.20 -73.862709 40.769142 -73.982079 40.762295
3 1 225 1.20 -73.982384 40.752102 -73.993332 40.736393
4 3 327 1.40 -73.987930 40.749542 -73.978981 40.766735
5 2 453 2.10 -73.984734 40.769310 -73.997787 40.744205
6 2 1361 18.90 -73.783318 40.648636 -73.964249 40.719543
7 2 188 0.80 -73.983711 40.756092 -73.984940 40.748222
8 1 587 2.80 -73.959908 40.806396 -73.983292 40.775673
9 2 927 1.80 -73.989059 40.750572 -73.973686 40.755997
10 1 145 0.50 -73.954796 40.598930 -73.950531 40.604225
11 1 600 3.00 0.000000 0.000000 0.000000 0.000000
12 1 660 3.74 0.000000 0.000000 0.000000 0.000000
13 1 360 1.40 0.000000 0.000000 0.000000 0.000000
14 5 1080 9.93 -73.863800 40.770416 -73.984840 40.732506
15 1 480 2.13 0.000000 0.000000 0.000000 0.000000
16 1 1080 6.16 -73.960670 40.797329 -73.924156 40.760742
17 1 180 1.22 -73.974487 40.746834 -73.984779 40.732513
18 1 240 1.46 -73.983727 40.738075 -73.974617 40.752460
19 2 1200 2.68 -73.980148 40.742676 -73.984367 40.770008
20 1 180 0.51 -73.998253 40.754307 -73.996323 40.758247
21 1 1680 17.88 -73.789696 40.645226 -73.993439 40.736015
22 6 540 1.83 -73.997284 40.766659 -73.993370 40.752247
23 1 1140 9.44 -73.873993 40.773716 -73.996803 40.746346
24 1 0 0.00 0.000000 0.000000 0.000000 0.000000
25 1 0 0.03 0.000000 0.000000 0.000000 0.000000
26 6 300 1.90 0.000000 0.000000 0.000000 0.000000
27 1 840 4.67 -73.991516 40.757980 -74.016251 40.706989
28 6 1560 7.25 -73.992340 40.743896 -73.985527 40.666286
29 2 780 2.18 0.000000 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ...
412600 3 870 2.10 -73.995277 40.759773 -73.991287 40.750641
412601 1 672 3.70 -73.980217 40.722275 -73.983498 40.760700
412602 1 175 0.30 -73.973900 40.762890 -73.976059 40.765587
412603 2 1027 2.00 -73.978600 40.762444 -73.990036 40.737782
412604 1 397 1.70 -73.988541 40.758659 -74.007416 40.742760
412605 1 406 1.80 -73.966995 40.769604 -73.980606 40.752010
412606 2 272 0.90 -73.941254 40.799267 -73.929596 40.798435
412607 2 957 4.60 -73.944550 40.775219 -73.997749 40.746368
412608 3 352 2.10 -74.005272 40.745613 -73.982338 40.763973
412609 4 511 1.70 -73.962547 40.758923 -73.978371 40.748322
412610 1 415 1.30 -73.955765 40.764145 -73.970177 40.752415
412611 2 217 0.60 -74.009560 40.710377 -74.013916 40.702526
412612 1 396 1.80 -73.987267 40.760902 -73.973457 40.782784
412613 1 269 1.00 -73.987587 40.758205 -73.981010 40.763435
412614 2 251 0.90 -73.992310 40.758945 -73.987343 40.766178
412615 1 445 2.10 -73.989174 40.748108 -73.966705 40.764053
412616 1 990 3.20 -74.000465 40.727512 -73.956985 40.703041
412617 1 1008 3.10 -73.985497 40.727898 -73.994278 40.759144
412618 1 317 0.80 -73.988670 40.748573 -73.978668 40.740856
412619 2 484 1.80 -73.979759 40.783775 -73.951759 40.772930
412620 1 494 1.70 -73.958359 40.713409 -73.944252 40.703625
412621 1 1338 11.00 -73.988510 40.731472 -74.034241 40.621216
412622 1 460 0.70 -73.979797 40.753292 -73.990562 40.751411
412623 1 353 1.70 -73.997681 40.736103 -73.991058 40.755836
412624 1 426 1.40 -73.987289 40.766117 -73.994728 40.750374
412625 1 2713 3.10 -73.991989 40.735332 -74.007835 40.720078
412626 1 653 1.60 -73.967957 40.762787 -73.992180 40.763992
412627 2 166 0.60 -74.006851 40.719524 -74.013901 40.715385
412628 1 754 2.40 -73.996071 40.768440 -73.991821 40.743557
412629 2 1545 12.90 -73.990845 40.756229 -73.851639 40.828156

412630 rows × 7 columns

In [10]:
data.passenger_count.min()
Out[10]:
0
In [11]:
data.passenger_count.max()
Out[11]:
6

Exercise: Calculate the median speed of each taxi ride.

Bonus: Also calculate the mean, and beware NaN and ∞!

In [12]:
# Speed in distance units per hour: trip_time_in_secs / 3600 converts seconds to hours.
speed = data.trip_distance / (data.trip_time_in_secs / 3600)
# NOTE: the median is taken before the non-finite values are filtered out below.
print('Median speed:', speed.median())
# Rides with trip_time_in_secs == 0 (such rows are visible in the table above)
# divide by zero, giving inf or NaN; keep only the finite speeds for the mean.
speed = speed[np.isfinite(speed)]
print('Mean speed:  ', speed.mean())
Median speed: 14.706927175843694
Mean speed:   18.1353235639
In [13]:
plt.hist(data.pickup_longitude);
In [14]:
def calculate_median_mad(values):
    """
    Calculate the median and MAD (Median Absolute Deviation) from
    a set of values.

    A MAD is like a standard deviation, but more robust in the face of
    outliers.

    Parameters
    ----------
    values : array-like
        The numbers to summarise: a list, tuple, NumPy array or
        pandas Series.

    Returns
    -------
    (median, mad)
        The median of ``values`` and the median of the absolute
        deviations of ``values`` from that median.
    """
    # Coerce first so that plain Python sequences work too; without this,
    # `values - median` raises TypeError for a list or tuple.
    values = np.asarray(values)
    median = np.median(values)
    mad = np.median(np.abs(values - median))
    return median, mad
In [15]:
# Compare the MAD of the pickup longitudes with their ordinary standard
# deviation.
med_long, mad_long = calculate_median_mad(data.pickup_longitude)
std_long = np.std(data.pickup_longitude)
print(mad_long, std_long)
0.012406 9.76513931034

Making a Module

Now let's make a new Python module! Open a new file named mad.py (the name the import below relies on), and put this in it:

import numpy as np

def median_mad(values):
    """
    Return the median of ``values`` together with their MAD
    (Median Absolute Deviation).

    The MAD plays the same role as a standard deviation, but a few
    extreme outliers barely move it.
    """
    samples = np.asarray(values)
    centre = np.median(samples)
    # Distance of every sample from the centre, ignoring sign.
    deviations = np.abs(samples - centre)
    return centre, np.median(deviations)
In [16]:
# Now we import our function from its file
from mad import median_mad

Exercise

Use the median and MAD to make a histogram of the pickup_longitude, with limits of 20 MAD.

In [17]:
# Histogram window: 20 MAD on either side of the median pickup longitude.
med_long, mad_long = median_mad(data.pickup_longitude)
long_lo = med_long - 20*mad_long
long_hi = med_long + 20*mad_long
# np.linspace supplies 100 evenly spaced bin edges spanning the window.
plt.hist(data.pickup_longitude, bins=np.linspace(long_lo, long_hi, 100));
In [18]:
# Histogram window: 10 MAD on either side of the median pickup latitude.
# Uses median_mad imported from the mad module, as the longitude histogram
# cell does, instead of the notebook-local calculate_median_mad.
med_lat, mad_lat = median_mad(data.pickup_latitude)
lat_lo = med_lat - 10*mad_lat
lat_hi = med_lat + 10*mad_lat
# np.linspace supplies 100 evenly spaced bin edges spanning the window.
plt.hist(data.pickup_latitude, bins=np.linspace(lat_lo, lat_hi, 100));
In [19]:
# Scatter of drop-off positions; the 'k,' format draws each ride as a black pixel.
plt.plot(data.dropoff_longitude, data.dropoff_latitude, 'k,')
# 'scaled' gives both axes the same scale.
plt.axis('scaled')
# Crop to the windows computed from the pickup longitude/latitude cells above.
plt.xlim(long_lo, long_hi)
plt.ylim(lat_lo, lat_hi)
Out[19]:
(40.57452900000002, 40.929148999999974)

Now we go to File -> Download As -> Python (.py), and turn this into a script!

Exercise

Go to https://docs.python.org/3/library/argparse.html, and use argparse to let the script take a different filename on the command line!

Exercise

There are three airports in New York:

  • Newark, at 40.69°, -74.174°
  • JFK, at 40.641°, -73.778°
  • LaGuardia, at 40.777°, -73.874°

Put them in the plot!

In [20]:
# Airport coordinates as [latitude, longitude] pairs, in degrees.
nwr = [40.69, -74.174]   # Newark
jfk = [40.641, -73.778]  # JFK
lga = [40.777, -73.874]  # LaGuardia

# One row per airport; columns are (latitude, longitude).
airports = np.array([nwr, jfk, lga])
# Transposing gives one row of latitudes and one row of longitudes to unpack.
airport_lat, airport_long = airports.T
In [21]:
# Overlay the airports on the drop-off map, using an explicit Figure/Axes pair.
fig, ax = plt.subplots()

# Airports: green circles, mostly transparent (alpha=0.2).
ax.plot(airport_long, airport_lat, 'go', alpha=0.2)
# Drop-offs: one black pixel per ride.
ax.plot(data.dropoff_longitude, data.dropoff_latitude, 'k,')
# Disabled alternative: pickups as red pixels.
#ax.plot(data.pickup_longitude, data.pickup_latitude, 'r,')

# 'scaled' gives both axes the same scale.
ax.axis('scaled')

# Crop to the pickup-based longitude/latitude windows computed earlier.
ax.set_xlim(long_lo, long_hi)
ax.set_ylim(lat_lo, lat_hi)
Out[21]:
(40.57452900000002, 40.929148999999974)