Home - NOAA Climate Data Analysis

NOAA Climate Data

Let’s study how extreme climate change is by using databases and Pandas!

We will start by importing all of the necessary packages we’ll need.

import pandas as pd
import sqlite3
import plotly.io as pio
from plotly import express as px
pio.renderers.default="iframe"

We will be creating a database with three tables: temperatures, stations, and countries. We first will transform the temperature data to make it more clean.

def prepare_df(df):
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()
    df = df.reset_index()
    df = df.rename(columns = {"level_2": "Month", 0: "Temp"})
    df["Month"] = df["Month"].str[5:].astype(int)
    df["Temp"] = df["Temp"]/100
    df["FIPS 10-4"] = df["ID"].str[:2]
    return df

We will now create a database with the filename climate.db

conn = sqlite3.connect("climate.db") #creates database in curr directory called climate.db

And now we have a file in our current directory called climate.db! We will now use the iterator ability of pandas to write the dataframe into the database

df_iter = pd.read_csv("temps.csv", chunksize=100000)
for df in df_iter:
    df = prepare_df(df)
    df.to_sql("temperatures", conn, if_exists="append", index=False)

Now we will add tables for the stations and countries. Since these aren’t nearly as large, we won’t be reading them in by chunks.

stations = pd.read_csv("station-metadata.csv")
stations.to_sql("stations", conn, if_exists="replace", index=False)

countries = pd.read_csv("countries.csv")
countries.to_sql("countries", conn, if_exists="replace", index=False)

Now that we’ve entered these as tables into our databases, we’ll look at the data and figure out how to query a climate database.

df.head() #part of the temperatures data

	ID	Year	Month	Temp	FIPS 10-4
0	USW00014924	2016	1	-13.69	US
1	USW00014924	2016	2	-8.40	US
2	USW00014924	2016	3	-0.20	US
3	USW00014924	2016	4	3.21	US
4	USW00014924	2016	5	13.85	US

stations.head()

	ID	LATITUDE	LONGITUDE	STNELEV	NAME
0	ACW00011604	57.7667	11.8667	18.0	SAVE
1	AE000041196	25.3330	55.5170	34.0	SHARJAH_INTER_AIRP
2	AEM00041184	25.6170	55.9330	31.0	RAS_AL_KHAIMAH_INTE
3	AEM00041194	25.2550	55.3640	10.4	DUBAI_INTL
4	AEM00041216	24.4300	54.4700	3.0	ABU_DHABI_BATEEN_AIR

countries.head()

	FIPS 10-4	ISO 3166	Name
0	AF	AF	Afghanistan
1	AX	-	Akrotiri
2	AL	AL	Albania
3	AG	DZ	Algeria
4	AQ	AS	American Samoa

Looks good! We will now set up our cursor for doing SQL commands from the tables, and check that we named them correctly

cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

[('temperatures',), ('stations',), ('countries',)]

Now we will make sure that we correctly populated our tables.

cursor.execute("SELECT sql FROM sqlite_master WHERE type='table';")
for result in cursor.fetchall():
    print(result[0])

CREATE TABLE "temperatures" (
"ID" TEXT,
  "Year" INTEGER,
  "Month" INTEGER,
  "Temp" REAL,
  "FIPS 10-4" TEXT
)
CREATE TABLE "stations" (
"ID" TEXT,
  "LATITUDE" REAL,
  "LONGITUDE" REAL,
  "STNELEV" REAL,
  "NAME" TEXT
)
CREATE TABLE "countries" (
"FIPS 10-4" TEXT,
  "ISO 3166" TEXT,
  "Name" TEXT
)

We will test out if we can reproduce the sample output provided in the directions of the homework.

cmd = \
"""
SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
WHERE T.year >= 1980 AND T.year <= 2020 AND T.month = 1 AND C.name = "India";
"""
df = pd.read_sql_query(cmd, conn)
df

	NAME	LATITUDE	LONGITUDE	Year	Month	Temp
0	PBO_ANANTAPUR	14.583	77.633	1980	1	23.48
1	PBO_ANANTAPUR	14.583	77.633	1981	1	24.57
2	PBO_ANANTAPUR	14.583	77.633	1982	1	24.19
3	PBO_ANANTAPUR	14.583	77.633	1983	1	23.51
4	PBO_ANANTAPUR	14.583	77.633	1984	1	24.81
...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	1983	1	5.10
3148	DARJEELING	27.050	88.270	1986	1	6.90
3149	DARJEELING	27.050	88.270	1994	1	8.10
3150	DARJEELING	27.050	88.270	1995	1	5.60
3151	DARJEELING	27.050	88.270	1997	1	5.70

3152 rows × 6 columns

We did it! We can now proceed and create the general function for querying based on the user inputs.

def query_climate_database(country, year_begin, year_end, month):
    cmd = \
    f"""
    SELECT S.name, S.latitude, S.longitude, T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON T."FIPS 10-4" = C."FIPS 10-4"
    WHERE T.year >= {year_begin} AND T.year <= {year_end} AND T.month = {month} 
    AND C.name = "{country}";
    """
    df = pd.read_sql_query(cmd, conn)
    return df

query_climate_database(country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                       month = 1)

	NAME	LATITUDE	LONGITUDE	Year	Month	Temp
0	PBO_ANANTAPUR	14.583	77.633	1980	1	23.48
1	PBO_ANANTAPUR	14.583	77.633	1981	1	24.57
2	PBO_ANANTAPUR	14.583	77.633	1982	1	24.19
3	PBO_ANANTAPUR	14.583	77.633	1983	1	23.51
4	PBO_ANANTAPUR	14.583	77.633	1984	1	24.81
...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	1983	1	5.10
3148	DARJEELING	27.050	88.270	1986	1	6.90
3149	DARJEELING	27.050	88.270	1994	1	8.10
3150	DARJEELING	27.050	88.270	1995	1	5.60
3151	DARJEELING	27.050	88.270	1997	1	5.70

3152 rows × 6 columns

We’ve done it! Now let’s test this for a different example just to make sure this is working.

query_climate_database(country = "United States", 
                       year_begin = 2000, 
                       year_end = 2010,
                       month = 12)

	NAME	LATITUDE	LONGITUDE	Year	Month	Temp
0	ADDISON	34.2553	-87.1814	2001	12	8.43
1	ADDISON	34.2553	-87.1814	2002	12	4.73
2	ADDISON	34.2553	-87.1814	2005	12	3.65
3	ADDISON	34.2553	-87.1814	2009	12	5.33
4	ADDISON	34.2553	-87.1814	2010	12	2.00
...	...	...	...	...	...	...
75667	LINCOLN_11_SW	40.6953	-96.8542	2006	12	1.45
75668	LINCOLN_11_SW	40.6953	-96.8542	2007	12	-4.18
75669	LINCOLN_11_SW	40.6953	-96.8542	2008	12	-4.88
75670	LINCOLN_11_SW	40.6953	-96.8542	2009	12	-6.97
75671	LINCOLN_11_SW	40.6953	-96.8542	2010	12	-3.46

75672 rows × 6 columns

df = query_climate_database(country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                       month = 1)
df

	NAME	LATITUDE	LONGITUDE	Year	Month	Temp
0	PBO_ANANTAPUR	14.583	77.633	1980	1	23.48
1	PBO_ANANTAPUR	14.583	77.633	1981	1	24.57
2	PBO_ANANTAPUR	14.583	77.633	1982	1	24.19
3	PBO_ANANTAPUR	14.583	77.633	1983	1	23.51
4	PBO_ANANTAPUR	14.583	77.633	1984	1	24.81
...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	1983	1	5.10
3148	DARJEELING	27.050	88.270	1986	1	6.90
3149	DARJEELING	27.050	88.270	1994	1	8.10
3150	DARJEELING	27.050	88.270	1995	1	5.60
3151	DARJEELING	27.050	88.270	1997	1	5.70

3152 rows × 6 columns

df["year_count"] = df.groupby("NAME")["Year"].transform("count")
df

	NAME	LATITUDE	LONGITUDE	Year	Month	Temp	year_count
0	PBO_ANANTAPUR	14.583	77.633	1980	1	23.48	34
1	PBO_ANANTAPUR	14.583	77.633	1981	1	24.57	34
2	PBO_ANANTAPUR	14.583	77.633	1982	1	24.19	34
3	PBO_ANANTAPUR	14.583	77.633	1983	1	23.51	34
4	PBO_ANANTAPUR	14.583	77.633	1984	1	24.81	34
...	...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	1983	1	5.10	7
3148	DARJEELING	27.050	88.270	1986	1	6.90	7
3149	DARJEELING	27.050	88.270	1994	1	8.10	7
3150	DARJEELING	27.050	88.270	1995	1	5.60	7
3151	DARJEELING	27.050	88.270	1997	1	5.70	7

3152 rows × 7 columns

from sklearn.linear_model import LinearRegression

def coef(data_group):
    x = data_group[["Year"]] # 2 brackets because X should be a df
    y = data_group["Temp"]   # 1 bracket because y should be a series
    LR = LinearRegression()
    LR.fit(x, y)
    return LR.coef_[0]

coefs = df.groupby(["NAME", "Month"]).apply(coef)
coefs = coefs.reset_index()
coefs

	NAME	Month	0
0	AGARTALA	1	-0.006184
1	AGRA	1	-0.095413
2	AHMADABAD	1	0.006731
3	AKOLA	1	-0.008063
4	ALLAHABAD	1	-0.029375
...	...	...	...
99	TRIVANDRUM	1	0.022892
100	UDAIPUR_DABOK	1	0.072424
101	VARANASI_BABATPUR	1	-0.012996
102	VERAVAL	1	0.024848
103	VISHAKHAPATNAM	1	-0.034050

104 rows × 3 columns

pd.merge(df, coefs, on="NAME")

	NAME	LATITUDE	LONGITUDE	Year	Month_x	Temp	year_count	Month_y	0
0	PBO_ANANTAPUR	14.583	77.633	1980	1	23.48	34	1	0.026258
1	PBO_ANANTAPUR	14.583	77.633	1981	1	24.57	34	1	0.026258
2	PBO_ANANTAPUR	14.583	77.633	1982	1	24.19	34	1	0.026258
3	PBO_ANANTAPUR	14.583	77.633	1983	1	23.51	34	1	0.026258
4	PBO_ANANTAPUR	14.583	77.633	1984	1	24.81	34	1	0.026258
...	...	...	...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	1983	1	5.10	7	1	-0.040133
3148	DARJEELING	27.050	88.270	1986	1	6.90	7	1	-0.040133
3149	DARJEELING	27.050	88.270	1994	1	8.10	7	1	-0.040133
3150	DARJEELING	27.050	88.270	1995	1	5.60	7	1	-0.040133
3151	DARJEELING	27.050	88.270	1997	1	5.70	7	1	-0.040133

3152 rows × 9 columns

We did it! We will now create a function that deals with plotting how the average yearly changes in temperature vary within a given country.

def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    df = query_climate_database(country = country, 
                       year_begin = year_begin, 
                       year_end = year_end,
                       month = month)
    df["YearCount"] = df.groupby("NAME")["Year"].transform("count")
    df = df[(df["YearCount"] >= min_obs)]
    coefs = df.groupby(["NAME", "Month"]).apply(coef)
    coefs = coefs.reset_index()
    coefs = coefs.rename(columns={0: "Estimated Yearly Increase (°C)"})
    df = pd.merge(df, coefs, on="NAME")
    df = df.round(4)
    fig = px.scatter_mapbox(data_frame=df,
                            lat="LATITUDE",
                            lon="LONGITUDE",
                            hover_name="NAME",
                            color="Estimated Yearly Increase (°C)",
                            **kwargs
                            )
    return fig

color_map = px.colors.diverging.RdGy_r
fig = temperature_coefficient_plot("India", 1980, 2020, 1, 
                                   min_obs = 10,
                                   zoom = 2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map
                                   )
fig.show()

This matches the plot that we were expecting to get! We can now see each station and look at the estimated yearly increase for each of them! Now let’s do another example, but for a different country.

fig = temperature_coefficient_plot("United States", 1960, 2010, 6, 
                                   min_obs = 12,
                                   zoom = 2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map
                                   )
fig.show()

The plot looks great! It seems that the estimated yearly increase has a much higher range than that of the previous plot.