# This is partially equivalent to %pylab inline, except expanded
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import zipfile
# Open the zipped taxi-ride CSV. The archive handle is left open because
# later statements read from `zipped` again.
zipped = zipfile.ZipFile('data/taxirides.csv.zip')
file_list = zipped.namelist()
print(file_list)

# Peek at the first two raw lines of the first archive member.
file_in_archive = file_list[0]
with zipped.open(file_in_archive, mode='r') as f:
    line = f.readline()
    print(line)
    line2 = f.readline()
    print(line2)
# Parse the CSV with NumPy, skipping the header row. The two bare
# expressions below are notebook-style inspections of the result.
with zipped.open(file_list[0]) as f:
    data = np.genfromtxt(f, delimiter=',', skip_header=1)

data.shape
data[0]
import pandas
# Re-read the same archive member with pandas; this rebinds `data`
# from the NumPy array to a DataFrame.
with zipped.open(file_list[0]) as f:
    data = pandas.read_csv(f)

data

# Smallest and largest passenger count in the table.
data['passenger_count'].min()
data['passenger_count'].max()
Bonus: also calculate the mean speed, and beware of NaN and ∞ (they can appear when the trip time is zero)!
# Speed per hour: trip time is converted from seconds to hours first.
trip_hours = data.trip_time_in_secs / 3600
speed = data.trip_distance / trip_hours
print('Median speed:', speed.median())

# Keep only finite values (drops NaN and +/-inf) before taking the mean.
finite_mask = np.isfinite(speed)
speed = speed[finite_mask]
print('Mean speed: ', speed.mean())

# Trailing semicolon suppresses the notebook's echo of the return value.
plt.hist(data.pickup_longitude);
def calculate_median_mad(values):
    """
    Calculate the median and MAD (Median Absolute Deviation) from
    a set of values.

    A MAD is like a standard deviation, but more robust in the face of
    outliers.

    Parameters
    ----------
    values : array-like
        The numbers to summarise; anything ``np.asarray`` accepts
        (list, tuple, ndarray, pandas Series, ...).

    Returns
    -------
    tuple
        ``(median, mad)``: the median of ``values`` and the median of
        the absolute deviations from that median.
    """
    # Coerce once up front so plain Python sequences and array-likes all
    # go through the same ndarray arithmetic below.
    values = np.asarray(values)
    median = np.median(values)
    mad = np.median(np.abs(values - median))
    return median, mad
# Compare the robust spread (MAD) of the pickup longitudes with their
# ordinary standard deviation.
pickup_long = data.pickup_longitude
med_long, mad_long = calculate_median_mad(pickup_long)
std_long = np.std(pickup_long)
print(mad_long, std_long)
Now let's make a new Python module! Open a new file named mad.py, and put this in it:
import numpy as np
def median_mad(values):
    """
    Return the median of ``values`` together with its MAD
    (Median Absolute Deviation).

    The MAD plays the same role as a standard deviation but is much
    less sensitive to outliers.
    """
    arr = np.asarray(values)
    centre = np.median(arr)
    # MAD: the median distance of the values from their own median.
    deviations = np.abs(arr - centre)
    spread = np.median(deviations)
    return centre, spread
# Now we import our function from its file
from mad import median_mad
Use the median and MAD to make a histogram of pickup_longitude, with limits of ±20 MAD around the median.
# Histogram of pickup longitudes, restricted to median +/- 20 MAD so that
# far-away outliers do not stretch the axis.
med_long, mad_long = median_mad(data.pickup_longitude)
long_half_width = 20*mad_long
long_lo = med_long - long_half_width
long_hi = med_long + long_half_width
long_bins = np.linspace(long_lo, long_hi, 100)
plt.hist(data.pickup_longitude, bins=long_bins);
# Same idea for pickup latitudes, with a window of median +/- 10 MAD.
med_lat, mad_lat = calculate_median_mad(data.pickup_latitude)
lat_half_width = 10*mad_lat
lat_lo = med_lat - lat_half_width
lat_hi = med_lat + lat_half_width
lat_bins = np.linspace(lat_lo, lat_hi, 100)
plt.hist(data.pickup_latitude, bins=lat_bins);
# Scatter the drop-off positions as single black pixels ('k,'), then
# clip the view to the longitude/latitude windows computed above.
dropoff_x = data.dropoff_longitude
dropoff_y = data.dropoff_latitude
plt.plot(dropoff_x, dropoff_y, 'k,')
plt.axis('scaled')
plt.xlim(long_lo, long_hi)
plt.ylim(lat_lo, lat_hi)
Now we go to File -> Download As -> Python (.py), and turn this into a script!
Go to https://docs.python.org/3/library/argparse.html, and use argparse to let the script take a different filename!
There are three airports in the New York area: Newark, JFK and LaGuardia.
Put them in the plot!
# Airport positions as [latitude, longitude] pairs.
# NOTE(review): `nwr` is presumably Newark -- confirm.
nwr = [40.69, -74.174]
jfk = [40.641, -73.778]
lga = [40.777, -73.874]

# One row per airport; column 0 is latitude, column 1 is longitude.
airports = np.array([nwr, jfk, lga])
airport_lat = airports[:, 0]
airport_long = airports[:, 1]

fig, ax = plt.subplots()
# Airports as translucent green circles, drop-offs as black pixels.
ax.plot(airport_long, airport_lat, 'go', alpha=0.2)
ax.plot(data.dropoff_longitude, data.dropoff_latitude, 'k,')
# Optional overlay of the pickups in red:
# ax.plot(data.pickup_longitude, data.pickup_latitude, 'r,')
ax.axis('scaled')
ax.set_xlim(long_lo, long_hi)
ax.set_ylim(lat_lo, lat_hi)