import pandas as pd
import sqlite3
import plotly.io as pio
from plotly import express as px
="iframe" pio.renderers.default
NOAA Climate Data
Let’s study how extreme climate change is by using databases and Pandas!
We will start by importing all of the necessary packages we’ll need.
We will be creating a database with three tables: temperatures, stations, and countries. We first will transform the temperature data to make it more clean.
def prepare_df(df):
= df.set_index(keys=["ID", "Year"])
df = df.stack()
df = df.reset_index()
df = df.rename(columns = {"level_2": "Month", 0: "Temp"})
df "Month"] = df["Month"].str[5:].astype(int)
df["Temp"] = df["Temp"]/100
df["FIPS 10-4"] = df["ID"].str[:2]
df[return df
We will now create a database with the filename climate.db
= sqlite3.connect("climate.db") #creates database in curr directory called climate.db conn
And now we have a file in our current directory called climate.db! We will now use the iterator ability of pandas to write the dataframe into the database
= pd.read_csv("temps.csv", chunksize=100000)
df_iter for df in df_iter:
= prepare_df(df)
df "temperatures", conn, if_exists="append", index=False) df.to_sql(
Now we will add tables for the stations and countries. Since these aren’t nearly as large, we won’t be reading them in by chunks.
= pd.read_csv("station-metadata.csv")
stations "stations", conn, if_exists="replace", index=False) stations.to_sql(
27585
= pd.read_csv("countries.csv")
countries "countries", conn, if_exists="replace", index=False) countries.to_sql(
279
Now that we’ve entered these as tables into our databases, we’ll look at the data and figure out how to query a climate database.
#part of the temperatures data df.head()
ID | Year | Month | Temp | FIPS 10-4 | |
---|---|---|---|---|---|
0 | USW00014924 | 2016 | 1 | -13.69 | US |
1 | USW00014924 | 2016 | 2 | -8.40 | US |
2 | USW00014924 | 2016 | 3 | -0.20 | US |
3 | USW00014924 | 2016 | 4 | 3.21 | US |
4 | USW00014924 | 2016 | 5 | 13.85 | US |
stations.head()
ID | LATITUDE | LONGITUDE | STNELEV | NAME | |
---|---|---|---|---|---|
0 | ACW00011604 | 57.7667 | 11.8667 | 18.0 | SAVE |
1 | AE000041196 | 25.3330 | 55.5170 | 34.0 | SHARJAH_INTER_AIRP |
2 | AEM00041184 | 25.6170 | 55.9330 | 31.0 | RAS_AL_KHAIMAH_INTE |
3 | AEM00041194 | 25.2550 | 55.3640 | 10.4 | DUBAI_INTL |
4 | AEM00041216 | 24.4300 | 54.4700 | 3.0 | ABU_DHABI_BATEEN_AIR |
countries.head()
FIPS 10-4 | ISO 3166 | Name | |
---|---|---|---|
0 | AF | AF | Afghanistan |
1 | AX | - | Akrotiri |
2 | AL | AL | Albania |
3 | AG | DZ | Algeria |
4 | AQ | AS | American Samoa |
Looks good! We will now set up our cursor for doing SQL commands from the tables, and check that we named them correctly
= conn.cursor()
cursor "SELECT name FROM sqlite_master WHERE type='table'")
cursor.execute(print(cursor.fetchall())
[('temperatures',), ('stations',), ('countries',)]
Now we will make sure that we correctly populated our tables.
"SELECT sql FROM sqlite_master WHERE type='table';")
cursor.execute(for result in cursor.fetchall():
print(result[0])
CREATE TABLE "temperatures" (
"ID" TEXT,
"Year" INTEGER,
"Month" INTEGER,
"Temp" REAL,
"FIPS 10-4" TEXT
)
CREATE TABLE "stations" (
"ID" TEXT,
"LATITUDE" REAL,
"LONGITUDE" REAL,
"STNELEV" REAL,
"NAME" TEXT
)
CREATE TABLE "countries" (
"FIPS 10-4" TEXT,
"ISO 3166" TEXT,
"Name" TEXT
)
We will test out if we can reproduce the sample output provided in the directions of the homework.
= \
cmd """
SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
WHERE T.year >= 1980 AND T.year <= 2020 AND T.month = 1 AND C.name = "India";
"""
= pd.read_sql_query(cmd, conn)
df df
NAME | LATITUDE | LONGITUDE | Year | Month | Temp | |
---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
We did it! We can now proceed and create the general function for querying based on the user inputs.
def query_climate_database(country, year_begin, year_end, month):
= \
cmd f"""
SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
WHERE T.year >= {year_begin} AND T.year <= {year_end} AND T.month = {month}
AND C.name = "{country}";
"""
= pd.read_sql_query(cmd, conn)
df return df
= "India",
query_climate_database(country = 1980,
year_begin = 2020,
year_end = 1) month
NAME | LATITUDE | LONGITUDE | Year | Month | Temp | |
---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
We’ve done it! Now let’s test this for a different example just to make sure this is working.
= "United States",
query_climate_database(country = 2000,
year_begin = 2010,
year_end = 12) month
NAME | LATITUDE | LONGITUDE | Year | Month | Temp | |
---|---|---|---|---|---|---|
0 | ADDISON | 34.2553 | -87.1814 | 2001 | 12 | 8.43 |
1 | ADDISON | 34.2553 | -87.1814 | 2002 | 12 | 4.73 |
2 | ADDISON | 34.2553 | -87.1814 | 2005 | 12 | 3.65 |
3 | ADDISON | 34.2553 | -87.1814 | 2009 | 12 | 5.33 |
4 | ADDISON | 34.2553 | -87.1814 | 2010 | 12 | 2.00 |
... | ... | ... | ... | ... | ... | ... |
75667 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2006 | 12 | 1.45 |
75668 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2007 | 12 | -4.18 |
75669 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2008 | 12 | -4.88 |
75670 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2009 | 12 | -6.97 |
75671 | LINCOLN_11_SW | 40.6953 | -96.8542 | 2010 | 12 | -3.46 |
75672 rows × 6 columns
= query_climate_database(country = "India",
df = 1980,
year_begin = 2020,
year_end = 1)
month df
NAME | LATITUDE | LONGITUDE | Year | Month | Temp | |
---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 |
3152 rows × 6 columns
"year_count"] = df.groupby("NAME")["Year"].transform("count")
df[ df
NAME | LATITUDE | LONGITUDE | Year | Month | Temp | year_count | |
---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 | 34 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 | 34 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 | 34 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 | 34 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 | 34 |
... | ... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 | 7 |
3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 | 7 |
3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 | 7 |
3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 | 7 |
3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 | 7 |
3152 rows × 7 columns
from sklearn.linear_model import LinearRegression
def coef(data_group):
= data_group[["Year"]] # 2 brackets because X should be a df
x = data_group["Temp"] # 1 bracket because y should be a series
y = LinearRegression()
LR
LR.fit(x, y)return LR.coef_[0]
= df.groupby(["NAME", "Month"]).apply(coef)
coefs = coefs.reset_index()
coefs coefs
NAME | Month | 0 | |
---|---|---|---|
0 | AGARTALA | 1 | -0.006184 |
1 | AGRA | 1 | -0.095413 |
2 | AHMADABAD | 1 | 0.006731 |
3 | AKOLA | 1 | -0.008063 |
4 | ALLAHABAD | 1 | -0.029375 |
... | ... | ... | ... |
99 | TRIVANDRUM | 1 | 0.022892 |
100 | UDAIPUR_DABOK | 1 | 0.072424 |
101 | VARANASI_BABATPUR | 1 | -0.012996 |
102 | VERAVAL | 1 | 0.024848 |
103 | VISHAKHAPATNAM | 1 | -0.034050 |
104 rows × 3 columns
="NAME") pd.merge(df, coefs, on
NAME | LATITUDE | LONGITUDE | Year | Month_x | Temp | year_count | Month_y | 0 | |
---|---|---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | 1980 | 1 | 23.48 | 34 | 1 | 0.026258 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | 1981 | 1 | 24.57 | 34 | 1 | 0.026258 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | 1982 | 1 | 24.19 | 34 | 1 | 0.026258 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | 1983 | 1 | 23.51 | 34 | 1 | 0.026258 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | 1984 | 1 | 24.81 | 34 | 1 | 0.026258 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | 1983 | 1 | 5.10 | 7 | 1 | -0.040133 |
3148 | DARJEELING | 27.050 | 88.270 | 1986 | 1 | 6.90 | 7 | 1 | -0.040133 |
3149 | DARJEELING | 27.050 | 88.270 | 1994 | 1 | 8.10 | 7 | 1 | -0.040133 |
3150 | DARJEELING | 27.050 | 88.270 | 1995 | 1 | 5.60 | 7 | 1 | -0.040133 |
3151 | DARJEELING | 27.050 | 88.270 | 1997 | 1 | 5.70 | 7 | 1 | -0.040133 |
3152 rows × 9 columns
We did it! We will now create a function that deals with plotting how the average yearly changes in temperature vary within a given country.
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
= query_climate_database(country = country,
df = year_begin,
year_begin = year_end,
year_end = month)
month "YearCount"] = df.groupby("NAME")["Year"].transform("count")
df[= df[(df["YearCount"] >= min_obs)]
df = df.groupby(["NAME", "Month"]).apply(coef)
coefs = coefs.reset_index()
coefs = coefs.rename(columns={0: "Estimated Yearly Increase (°C)"})
coefs = pd.merge(df, coefs, on="NAME")
df = df.round(4)
df = px.scatter_mapbox(data_frame=df,
fig ="LATITUDE",
lat="LONGITUDE",
lon="NAME",
hover_name="Estimated Yearly Increase (°C)",
color**kwargs
)return fig
= px.colors.diverging.RdGy_r
color_map = temperature_coefficient_plot("India", 1980, 2020, 1,
fig = 10,
min_obs = 2,
zoom ="carto-positron",
mapbox_style=color_map
color_continuous_scale
) fig.show()
This matches the plot that we were expecting to get! We can now see each station and look at the estimated yearly increase for each of them! Now let’s do another example, but for a different country.
= temperature_coefficient_plot("United States", 1960, 2010, 6,
fig = 12,
min_obs = 2,
zoom ="carto-positron",
mapbox_style=color_map
color_continuous_scale
) fig.show()
The plot looks great! It seems that the estimated yearly increase has a much higher range than that of the previous plot.