# NOAA Climate Data

Let's study just how extreme climate change is, using databases and Pandas! We will start by importing all of the packages we'll need.

```python
import pandas as pd
import sqlite3
import plotly.io as pio
from plotly import express as px

pio.renderers.default = "iframe"  # render Plotly figures as iframes
```

We will be creating a database with three tables: `temperatures`, `stations`, and `countries`. First, we'll write a helper that cleans up the temperature data.
```python
def prepare_df(df):
    """Reshape a chunk of the raw temperature data into long format."""
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()                                 # one row per (station, year, month)
    df = df.reset_index()
    df = df.rename(columns = {"level_2": "Month", 0: "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)   # e.g. "VALUE3" -> 3
    df["Temp"] = df["Temp"]/100                     # hundredths of a degree -> °C
    df["FIPS 10-4"] = df["ID"].str[:2]              # country code from the station ID
    return df
```
We will now create a database with the filename `climate.db`.
```python
conn = sqlite3.connect("climate.db")  # creates climate.db in the current directory
```

And now we have a file called `climate.db` in our current directory! Since `temps.csv` is large, we'll use the chunked iterator that pandas' `read_csv` provides to write the DataFrame into the database piece by piece.
```python
df_iter = pd.read_csv("temps.csv", chunksize=100000)
for df in df_iter:
    df = prepare_df(df)
    df.to_sql("temperatures", conn, if_exists="append", index=False)
```

Now we will add tables for the stations and countries. Since these files aren't nearly as large, we won't read them in by chunks.
```python
stations = pd.read_csv("station-metadata.csv")
stations.to_sql("stations", conn, if_exists="replace", index=False)  # to_sql reports 27585 rows written

countries = pd.read_csv("countries.csv")
countries.to_sql("countries", conn, if_exists="replace", index=False)  # to_sql reports 279 rows written
```
Now that we've written these tables into our database, let's take a look at the data and figure out how to query it.
```python
df.head()  # part of the temperatures data
```

|   | ID | Year | Month | Temp | FIPS 10-4 |
|---|---|---|---|---|---|
| 0 | USW00014924 | 2016 | 1 | -13.69 | US |
| 1 | USW00014924 | 2016 | 2 | -8.40 | US |
| 2 | USW00014924 | 2016 | 3 | -0.20 | US |
| 3 | USW00014924 | 2016 | 4 | 3.21 | US |
| 4 | USW00014924 | 2016 | 5 | 13.85 | US |
```python
stations.head()
```

|   | ID | LATITUDE | LONGITUDE | STNELEV | NAME |
|---|---|---|---|---|---|
| 0 | ACW00011604 | 57.7667 | 11.8667 | 18.0 | SAVE |
| 1 | AE000041196 | 25.3330 | 55.5170 | 34.0 | SHARJAH_INTER_AIRP |
| 2 | AEM00041184 | 25.6170 | 55.9330 | 31.0 | RAS_AL_KHAIMAH_INTE |
| 3 | AEM00041194 | 25.2550 | 55.3640 | 10.4 | DUBAI_INTL |
| 4 | AEM00041216 | 24.4300 | 54.4700 | 3.0 | ABU_DHABI_BATEEN_AIR |
```python
countries.head()
```

|   | FIPS 10-4 | ISO 3166 | Name |
|---|---|---|---|
| 0 | AF | AF | Afghanistan |
| 1 | AX | - | Akrotiri |
| 2 | AL | AL | Albania |
| 3 | AG | DZ | Algeria |
| 4 | AQ | AS | American Samoa |
Looks good! We will now set up a cursor for executing SQL commands against the database, and check that we named the tables correctly.
```python
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())
```

```
[('temperatures',), ('stations',), ('countries',)]
```
Now we will make sure that we correctly populated our tables.
cursor.execute("SELECT sql FROM sqlite_master WHERE type='table';")
for result in cursor.fetchall():
print(result[0])CREATE TABLE "temperatures" (
"ID" TEXT,
"Year" INTEGER,
"Month" INTEGER,
"Temp" REAL,
"FIPS 10-4" TEXT
)
CREATE TABLE "stations" (
"ID" TEXT,
"LATITUDE" REAL,
"LONGITUDE" REAL,
"STNELEV" REAL,
"NAME" TEXT
)
CREATE TABLE "countries" (
"FIPS 10-4" TEXT,
"ISO 3166" TEXT,
"Name" TEXT
)
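As an extra sanity check (a small optional sketch, not part of the assignment), we can also count the rows that landed in each table; these should match the sizes of the files we loaded.

```python
# Optional check (sketch): how many rows ended up in each table?
for table in ["temperatures", "stations", "countries"]:
    cursor.execute(f"SELECT COUNT(*) FROM {table}")
    print(table, cursor.fetchone()[0])
```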
Let's test whether we can reproduce the sample output provided in the homework directions.
```python
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
WHERE T.year >= 1980 AND T.year <= 2020 AND T.month = 1 AND C.name = 'India';
"""
df = pd.read_sql_query(cmd, conn)
df
```

|   | NAME | LATITUDE | LONGITUDE | Year | Month | Temp |
|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
| ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
| 3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
| 3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
| 3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
| 3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
We did it! We can now write a general function that runs this query for user-specified inputs.
```python
def query_climate_database(country, year_begin, year_end, month):
    """Return temperature readings for the given country, month, and year range."""
    cmd = \
    f"""
    SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
    WHERE T.year >= {year_begin} AND T.year <= {year_end} AND T.month = {month}
        AND C.name = '{country}';
    """
    df = pd.read_sql_query(cmd, conn)
    return df
```

```python
query_climate_database(country = "India",
                       year_begin = 1980,
                       year_end = 2020,
                       month = 1)
```

|   | NAME | LATITUDE | LONGITUDE | Year | Month | Temp |
|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
| ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
| 3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
| 3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
| 3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
| 3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
We've done it! Now let's test a different example just to make sure everything is working.
```python
query_climate_database(country = "United States",
                       year_begin = 2000,
                       year_end = 2010,
                       month = 12)
```

|   | NAME | LATITUDE | LONGITUDE | Year | Month | Temp |
|---|---|---|---|---|---|---|
| 0 | ADDISON | 34.2553 | -87.1814 | 2001 | 12 | 8.43 |
| 1 | ADDISON | 34.2553 | -87.1814 | 2002 | 12 | 4.73 |
| 2 | ADDISON | 34.2553 | -87.1814 | 2005 | 12 | 3.65 |
| 3 | ADDISON | 34.2553 | -87.1814 | 2009 | 12 | 5.33 |
| 4 | ADDISON | 34.2553 | -87.1814 | 2010 | 12 | 2.00 |
| ... | ... | ... | ... | ... | ... | ... |
| 75667 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2006 | 12 | 1.45 |
| 75668 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2007 | 12 | -4.18 |
| 75669 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2008 | 12 | -4.88 |
| 75670 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2009 | 12 | -6.97 |
| 75671 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2010 | 12 | -3.46 |
75672 rows × 6 columns
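One note on style before moving on: interpolating the arguments into the SQL string with an f-string is fine for our own trusted inputs, but a parameterized version avoids quoting headaches. Here is a sketch of the same query using `?` placeholders (the `_safe` function name is just for illustration):

```python
def query_climate_database_safe(country, year_begin, year_end, month):
    # Same query as above, but the values are passed to sqlite3 as parameters
    # instead of being interpolated into the string.
    cmd = """
    SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
    WHERE T.year >= ? AND T.year <= ? AND T.month = ? AND C.name = ?;
    """
    return pd.read_sql_query(cmd, conn, params=(year_begin, year_end, month, country))
```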
To visualize warming trends, we'll want to estimate, for each station, how quickly temperatures in a given month have been changing. Let's grab the India data again and work with it in pandas.

```python
df = query_climate_database(country = "India",
                            year_begin = 1980,
                            year_end = 2020,
                            month = 1)
df
```

|   | NAME | LATITUDE | LONGITUDE | Year | Month | Temp |
|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
| ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
| 3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
| 3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
| 3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
| 3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
df["year_count"] = df.groupby("NAME")["Year"].transform("count")
df| NAME | LATITUDE | LONGITUDE | Year | Month | Temp | year_count | |
|---|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 | 34 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 | 34 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 | 34 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 | 34 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 | 34 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 | 7 |
| 3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 | 7 |
| 3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 | 7 |
| 3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 | 7 |
| 3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 | 7 |
3152 rows × 7 columns
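For anyone unfamiliar with `transform`, here is a tiny made-up example of the difference between it and a plain `count`:

```python
# Made-up example: count() collapses to one row per group, while
# transform("count") returns one value per original row.
toy = pd.DataFrame({"NAME": ["A", "A", "B"], "Year": [1980, 1981, 1980]})
toy.groupby("NAME")["Year"].count()              # A -> 2, B -> 1
toy.groupby("NAME")["Year"].transform("count")   # 2, 2, 1 (aligned with toy's rows)
```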
Now we'll estimate, for each station, the slope of a simple linear regression of temperature against year. This slope is the estimated yearly change in temperature, in °C per year.

```python
from sklearn.linear_model import LinearRegression

def coef(data_group):
    """Return the fitted yearly temperature trend (°C/year) for one station."""
    x = data_group[["Year"]]   # two brackets because X should be a DataFrame
    y = data_group["Temp"]     # one bracket because y should be a Series
    LR = LinearRegression()
    LR.fit(x, y)
    return LR.coef_[0]
```

```python
coefs = df.groupby(["NAME", "Month"]).apply(coef)
coefs = coefs.reset_index()
coefs
```

|   | NAME | Month | 0 |
|---|---|---|---|
| 0 | AGARTALA | 1 | -0.006184 |
| 1 | AGRA | 1 | -0.095413 |
| 2 | AHMADABAD | 1 | 0.006731 |
| 3 | AKOLA | 1 | -0.008063 |
| 4 | ALLAHABAD | 1 | -0.029375 |
| ... | ... | ... | ... |
| 99 | TRIVANDRUM | 1 | 0.022892 |
| 100 | UDAIPUR_DABOK | 1 | 0.072424 |
| 101 | VARANASI_BABATPUR | 1 | -0.012996 |
| 102 | VERAVAL | 1 | 0.024848 |
| 103 | VISHAKHAPATNAM | 1 | -0.034050 |
104 rows × 3 columns
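Before merging, it can be helpful to glance at the extreme fitted trends (an optional peek; note that the slope column is literally labeled `0` until we rename it later):

```python
# A quick peek (optional) at the extremes of the fitted January trends, in °C/year.
coefs.sort_values(0).head(3)    # stations whose January temps cooled the fastest
coefs.sort_values(0).tail(3)    # stations whose January temps warmed the fastest
```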
Finally, we merge the fitted slopes back onto the station data, so that every row carries its station's estimated trend.

```python
pd.merge(df, coefs, on="NAME")
```

|   | NAME | LATITUDE | LONGITUDE | Year | Month_x | Temp | year_count | Month_y | 0 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 | 34 | 1 | 0.026258 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 | 34 | 1 | 0.026258 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 | 34 | 1 | 0.026258 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 | 34 | 1 | 0.026258 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 | 34 | 1 | 0.026258 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 | 7 | 1 | -0.040133 |
| 3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 | 7 | 1 | -0.040133 |
| 3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 | 7 | 1 | -0.040133 |
| 3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 | 7 | 1 | -0.040133 |
| 3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 | 7 | 1 | -0.040133 |
3152 rows × 9 columns
We did it! We will now wrap all of this into a function that plots how the estimated yearly change in temperature varies across stations within a given country.
```python
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    """Plot an interactive map of the estimated yearly temperature change per station."""
    df = query_climate_database(country = country,
                                year_begin = year_begin,
                                year_end = year_end,
                                month = month)
    # keep only stations with at least min_obs years of data
    df["YearCount"] = df.groupby("NAME")["Year"].transform("count")
    df = df[df["YearCount"] >= min_obs]
    # fit a per-station linear trend and give the slope column a readable name
    coefs = df.groupby(["NAME", "Month"]).apply(coef)
    coefs = coefs.reset_index()
    coefs = coefs.rename(columns={0: "Estimated Yearly Increase (°C)"})
    df = pd.merge(df, coefs, on="NAME")
    df = df.round(4)
    # any extra keyword arguments are passed straight through to px.scatter_mapbox
    fig = px.scatter_mapbox(data_frame=df,
                            lat="LATITUDE",
                            lon="LONGITUDE",
                            hover_name="NAME",
                            color="Estimated Yearly Increase (°C)",
                            **kwargs)
    return fig
```
fig = temperature_coefficient_plot("India", 1980, 2020, 1,
min_obs = 10,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map
)
fig.show()This matches the plot that we were expecting to get! We can now see each station and look at the estimated yearly increase for each of them! Now let’s do another example, but for a different country.
fig = temperature_coefficient_plot("United States", 1960, 2010, 6,
min_obs = 12,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map
)
fig.show()The plot looks great! It seems that the estimated yearly increase has a much higher range than that of the previous plot.
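One last bit of housekeeping (an optional sketch; the filename is just illustrative): Plotly figures can be saved as standalone HTML files, and we should close the database connection once we're done with it.

```python
# Optional cleanup (sketch): save the interactive map and close the database.
fig.write_html("us_june_temperature_trends.html")  # illustrative filename
conn.close()
```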