pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
and see some of the fun stuff!
slides online at http://trescher.fr/pandas/
from __future__ import print_function, division
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
import pandas as pd
pd.set_option('display.max_rows', 8)
Data from DWD:
ftp://ftp-cdc.dwd.de/pub/CDC/observations_germany/climate/daily/kl/historical/
Daily data, they also have hourly / monthly data
Station IDs:
Berlin (Dahlem - FU) 403
Freiburg 1443
df1 = pd.read_csv('produkt_klima_Tageswerte_19500101_20141231_00403.txt',
sep=";",
index_col=1,
parse_dates=[1],
skipinitialspace=True)
df2 = pd.read_csv('produkt_klima_Tageswerte_19490501_20141231_01443.txt',
sep=";",
index_col=1,
parse_dates=[1],
skipinitialspace=True)
#last row invalid for obscure reasons
df1.drop(df1.index[-1], inplace=True)
df2.drop(df2.index[-1], inplace=True)
There is also:
df1
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1950-01-01 | 403 | 5 | -3.2 | 4.0 | 5.0 | 1025.60 | 83.00 | -999 | -1.1 | -4.9 | -6.3 | -999 | 2.2 | 7 | -999.000 | 0 | eor |
1950-01-02 | 403 | 5 | 1.0 | 6.1 | 8.0 | 1005.60 | 95.00 | -999 | 2.2 | -3.7 | -5.3 | -999 | 12.6 | 8 | -999.000 | 0 | eor |
1950-01-03 | 403 | 5 | 2.8 | 6.5 | 5.0 | 996.60 | 86.00 | -999 | 3.9 | 1.7 | -1.4 | -999 | 0.5 | 1 | -999.000 | 0 | eor |
1950-01-04 | 403 | 5 | -0.1 | 5.2 | 7.7 | 999.50 | 85.00 | -999 | 2.1 | -0.9 | -2.3 | -999 | 0.5 | 7 | -999.000 | 0 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 3 | -5.1 | 3.8 | 4.6 | 1012.22 | 90.21 | -999 | -2.3 | -7.6 | -10.1 | -999 | 3.6 | 7 | 1.333 | 0 | eor |
2014-12-29 | 403 | 3 | -3.6 | 4.5 | 5.6 | 1018.05 | 95.92 | -999 | -1.9 | -6.0 | -13.8 | -999 | 0.0 | 7 | 6.117 | 11 | eor |
2014-12-30 | 403 | 3 | -1.8 | 5.2 | 7.9 | 1020.31 | 97.42 | -999 | 1.1 | -3.5 | -4.6 | -999 | 0.6 | 8 | 0.000 | 8 | eor |
2014-12-31 | 403 | 3 | 2.8 | 7.4 | 8.0 | 1019.90 | 99.00 | -999 | 4.9 | 1.1 | 0.0 | -999 | 0.9 | 6 | 0.000 | 6 | eor |
23741 rows × 17 columns
df1.columns
Index([u'STATIONS_ID', u'QUALITAETS_NIVEAU', u'LUFTTEMPERATUR', u'DAMPFDRUCK', u'BEDECKUNGSGRAD', u'LUFTDRUCK_STATIONSHOEHE', u'REL_FEUCHTE', u'WINDGESCHWINDIGKEIT', u'LUFTTEMPERATUR_MAXIMUM', u'LUFTTEMPERATUR_MINIMUM', u'LUFTTEMP_AM_ERDB_MINIMUM', u'WINDSPITZE_MAXIMUM', u'NIEDERSCHLAGSHOEHE', u'NIEDERSCHLAGSHOEHE_IND', u'SONNENSCHEINDAUER', u'SCHNEEHOEHE', u'eor'], dtype='object')
df1["LUFTTEMPERATUR"]
MESS_DATUM 1950-01-01 -3.2 1950-01-02 1.0 ... 2014-12-30 -1.8 2014-12-31 2.8 Name: LUFTTEMPERATUR, Length: 23741
it's called a "Series"
Label based
df1.loc["1970-01-01"]["LUFTTEMPERATUR"]
-12.800000000000001
and integer-index based
df1.iloc[120:123]
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1950-05-01 | 403 | 5 | 16.1 | 9.5 | 0.7 | 1017.9 | 55 | -999 | 21.5 | 5.6 | 1.0 | -999 | 0.0 | 0 | 12.8 | 0 | eor |
1950-05-02 | 403 | 5 | 17.4 | 10.0 | 1.3 | 1011.6 | 53 | -999 | 23.9 | 10.5 | 6.7 | -999 | 0.2 | 1 | 12.6 | 0 | eor |
1950-05-03 | 403 | 5 | 11.2 | 11.6 | 6.7 | 1011.0 | 85 | -999 | 16.7 | 9.8 | 9.8 | -999 | 1.1 | 1 | 0.2 | 0 | eor |
df1["1970"]
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1970-01-01 | 403 | 5 | -12.8 | 2.0 | 8.0 | 997.6 | 84 | -999 | -11.4 | -15.6 | -15.6 | -999 | 0.6 | 7 | 0.0 | 8 | eor |
1970-01-02 | 403 | 5 | -4.9 | 3.9 | 8.0 | 990.6 | 92 | -999 | -2.8 | -12.7 | -12.6 | -999 | 6.6 | 7 | 0.0 | 9 | eor |
1970-01-03 | 403 | 5 | -1.8 | 5.2 | 7.0 | 989.9 | 96 | -999 | 0.2 | -4.0 | -10.1 | -999 | 3.2 | 7 | 0.0 | 19 | eor |
1970-01-04 | 403 | 5 | -5.9 | 3.7 | 1.0 | 993.8 | 88 | -999 | -1.2 | -8.0 | -12.6 | -999 | 0.0 | 0 | 6.4 | 23 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1970-12-28 | 403 | 5 | -2.4 | 4.4 | 7.7 | 997.5 | 88 | -999 | -2.1 | -4.1 | -4.6 | -999 | 7.4 | 7 | 0.0 | 2 | eor |
1970-12-29 | 403 | 5 | -2.2 | 4.8 | 8.0 | 997.2 | 91 | -999 | -1.5 | -2.5 | -3.0 | -999 | 0.6 | 7 | 0.0 | 12 | eor |
1970-12-30 | 403 | 5 | -2.1 | 4.8 | 8.0 | 988.7 | 91 | -999 | -1.1 | -2.7 | -3.1 | -999 | 1.2 | 7 | 0.0 | 9 | eor |
1970-12-31 | 403 | 5 | -7.8 | 3.1 | 5.3 | 999.5 | 89 | -999 | -2.3 | -9.6 | -12.2 | -999 | 0.0 | 7 | 0.2 | 10 | eor |
365 rows × 17 columns
df1["1970":"1980"]
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1970-01-01 | 403 | 5 | -12.8 | 2.0 | 8.0 | 997.6 | 84 | -999 | -11.4 | -15.6 | -15.6 | -999 | 0.6 | 7 | 0.0 | 8 | eor |
1970-01-02 | 403 | 5 | -4.9 | 3.9 | 8.0 | 990.6 | 92 | -999 | -2.8 | -12.7 | -12.6 | -999 | 6.6 | 7 | 0.0 | 9 | eor |
1970-01-03 | 403 | 5 | -1.8 | 5.2 | 7.0 | 989.9 | 96 | -999 | 0.2 | -4.0 | -10.1 | -999 | 3.2 | 7 | 0.0 | 19 | eor |
1970-01-04 | 403 | 5 | -5.9 | 3.7 | 1.0 | 993.8 | 88 | -999 | -1.2 | -8.0 | -12.6 | -999 | 0.0 | 0 | 6.4 | 23 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1980-12-28 | 403 | 5 | 1.0 | 5.4 | 7.7 | 1025.9 | 84 | -999 | 2.3 | -1.1 | -1.4 | -999 | 0.3 | 6 | 0.1 | 0 | eor |
1980-12-29 | 403 | 5 | 4.8 | 7.4 | 8.0 | 1024.3 | 87 | -999 | 5.7 | 1.4 | 0.4 | -999 | 0.2 | 6 | 0.0 | 0 | eor |
1980-12-30 | 403 | 5 | 6.8 | 8.4 | 8.0 | 1021.5 | 85 | -999 | 7.9 | 4.7 | 4.0 | -999 | 0.0 | 6 | 0.0 | 0 | eor |
1980-12-31 | 403 | 5 | 6.2 | 7.0 | 7.7 | 1008.2 | 73 | -999 | 7.4 | 5.3 | 5.3 | -999 | 3.7 | 6 | 0.0 | 0 | eor |
4018 rows × 17 columns
df1.describe()
QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741 | 23741.000000 | 23741.000000 | 23741.000000 | 23741 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 |
mean | 7.189756 | 9.228912 | 9.489082 | 5.205307 | 1007.985151 | 76.475301 | -999 | 13.351135 | 5.269264 | 3.511322 | -999 | 1.623912 | 3.093678 | -1.455907 | 0.817741 |
std | 2.643762 | 7.641879 | 7.712391 | 2.255881 | 9.136360 | 14.642451 | 0 | 8.967447 | 6.763194 | 14.702692 | 0 | 3.851226 | 3.103872 | 78.322974 | 3.437469 |
min | 3.000000 | -17.900000 | -999.000000 | 0.000000 | 961.000000 | -999.000000 | -999 | -16.400000 | -21.800000 | -999.000000 | -999 | 0.000000 | 0.000000 | -999.000000 | 0.000000 |
25% | 5.000000 | 3.400000 | 6.300000 | 3.700000 | 1002.400000 | 68.000000 | -999 | 6.200000 | 0.500000 | -0.900000 | -999 | 0.000000 | 0.000000 | 0.300000 | 0.000000 |
50% | 5.000000 | 9.400000 | 8.900000 | 5.700000 | 1008.300000 | 78.000000 | -999 | 13.600000 | 5.400000 | 3.800000 | -999 | 0.000000 | 1.000000 | 3.600000 | 0.000000 |
75% | 10.000000 | 15.300000 | 12.500000 | 7.300000 | 1013.900000 | 86.000000 | -999 | 20.400000 | 10.600000 | 9.000000 | -999 | 1.500000 | 6.000000 | 7.900000 | 0.000000 |
max | 10.000000 | 29.500000 | 23.900000 | 8.000000 | 1040.000000 | 100.000000 | -999 | 37.800000 | 21.300000 | 21.000000 | -999 | 106.000000 | 8.000000 | 16.367000 | 49.000000 |
access a single column
df1.SCHNEEHOEHE.describe().to_frame()
SCHNEEHOEHE | |
---|---|
count | 23741.000000 |
mean | 0.817741 |
std | 3.437469 |
min | 0.000000 |
25% | 0.000000 |
50% | 0.000000 |
75% | 0.000000 |
max | 49.000000 |
df1["SCHNEEHOEHE"] > 0
MESS_DATUM 1950-01-01 False 1950-01-02 False ... 2014-12-30 True 2014-12-31 True Name: SCHNEEHOEHE, Length: 23741
snow_days = df1[df1["SCHNEEHOEHE"] > 0]
snow_days
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1950-01-06 | 403 | 5 | 2.6 | 5.6 | 7.3 | 997.50 | 79.00 | -999 | 5.0 | -4.0 | -4.0 | -999 | 7.2 | 8 | -999.000 | 12 | eor |
1950-01-11 | 403 | 5 | -7.2 | 3.1 | 7.0 | 1025.20 | 82.00 | -999 | -4.4 | -8.7 | -5.7 | -999 | 0.6 | 1 | -999.000 | 2 | eor |
1950-02-02 | 403 | 5 | -5.0 | 3.1 | 8.0 | 1017.80 | 74.00 | -999 | -3.5 | -6.9 | -7.7 | -999 | 0.0 | 0 | -999.000 | 2 | eor |
1950-02-03 | 403 | 5 | -2.0 | 3.5 | 7.3 | 1013.60 | 67.00 | -999 | 1.2 | -8.0 | -9.4 | -999 | 0.0 | 0 | -999.000 | 2 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-02-06 | 403 | 3 | 4.5 | 6.4 | 4.7 | 995.82 | 77.25 | -999 | 10.4 | 0.7 | -2.5 | -999 | 0.3 | 6 | 5.283 | 1 | eor |
2014-12-29 | 403 | 3 | -3.6 | 4.5 | 5.6 | 1018.05 | 95.92 | -999 | -1.9 | -6.0 | -13.8 | -999 | 0.0 | 7 | 6.117 | 11 | eor |
2014-12-30 | 403 | 3 | -1.8 | 5.2 | 7.9 | 1020.31 | 97.42 | -999 | 1.1 | -3.5 | -4.6 | -999 | 0.6 | 8 | 0.000 | 8 | eor |
2014-12-31 | 403 | 3 | 2.8 | 7.4 | 8.0 | 1019.90 | 99.00 | -999 | 4.9 | 1.1 | 0.0 | -999 | 0.9 | 6 | 0.000 | 6 | eor |
2517 rows × 17 columns
snow_days.describe()
QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2517.000000 | 2517.000000 | 2517.000000 | 2517.000000 | 2517.000000 | 2517.000000 | 2517 | 2517.000000 | 2517.000000 | 2517.000000 | 2517 | 2517.000000 | 2517.000000 | 2517.000000 | 2517.000000 |
mean | 7.028208 | -2.119031 | 4.593921 | 5.885936 | 1007.632932 | 83.989245 | -999 | 0.376480 | -4.994438 | -6.662495 | -999 | 1.211085 | 5.256257 | -3.809342 | 7.713151 |
std | 2.583502 | 4.070684 | 1.452455 | 2.300667 | 11.765287 | 9.092571 | 0 | 3.990027 | 4.586212 | 5.421881 | 0 | 2.347022 | 3.156976 | 77.125569 | 7.634561 |
min | 3.000000 | -17.900000 | 1.100000 | 0.000000 | 967.900000 | 44.000000 | -999 | -16.400000 | -21.800000 | -31.600000 | -999 | 0.000000 | 0.000000 | -999.000000 | 1.000000 |
25% | 5.000000 | -4.500000 | 3.500000 | 4.700000 | 999.300000 | 79.000000 | -999 | -1.900000 | -7.900000 | -9.800000 | -999 | 0.000000 | 1.000000 | 0.000000 | 2.000000 |
50% | 5.000000 | -1.500000 | 4.500000 | 6.800000 | 1008.000000 | 85.290000 | -999 | 0.700000 | -4.200000 | -5.600000 | -999 | 0.100000 | 7.000000 | 0.400000 | 5.000000 |
75% | 10.000000 | 0.600000 | 5.600000 | 7.800000 | 1016.300000 | 90.630000 | -999 | 2.800000 | -1.400000 | -2.400000 | -999 | 1.300000 | 7.000000 | 3.900000 | 11.000000 |
max | 10.000000 | 12.700000 | 11.300000 | 8.000000 | 1040.000000 | 100.000000 | -999 | 18.400000 | 8.800000 | 8.000000 | -999 | 21.900000 | 8.000000 | 11.667000 | 49.000000 |
snow_days["2000-01-01":"2014-01-01"]
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
2000-01-21 | 403 | 10 | -2.0 | 4.1 | 4.7 | 1006.90 | 73.00 | -999 | 4.5 | -3.8 | -0.7 | -999 | 0.3 | 7 | 4.70 | 4 | eor |
2000-01-22 | 403 | 10 | -3.1 | 4.1 | 7.7 | 1000.50 | 83.00 | -999 | -1.6 | -4.4 | -6.9 | -999 | 2.4 | 7 | 0.00 | 4 | eor |
2000-01-23 | 403 | 10 | -6.1 | 2.6 | 3.0 | 1006.10 | 69.00 | -999 | -2.6 | -8.7 | -13.2 | -999 | 0.1 | 7 | 7.60 | 6 | eor |
2000-01-24 | 403 | 10 | -6.4 | 3.1 | 5.7 | 1014.40 | 77.00 | -999 | -1.9 | -8.6 | -9.1 | -999 | 0.0 | 7 | 3.10 | 6 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2013-04-07 | 403 | 3 | 2.3 | 4.4 | 2.0 | 1009.70 | 64.67 | -999 | 9.2 | -4.6 | -5.8 | -999 | 0.0 | 6 | 8.75 | 6 | eor |
2013-04-08 | 403 | 3 | 3.7 | 4.6 | 4.9 | 1002.02 | 61.13 | -999 | 9.6 | -3.2 | -4.8 | -999 | 0.0 | 0 | 7.55 | 4 | eor |
2013-12-07 | 403 | 3 | 0.9 | 5.2 | 5.5 | 1008.05 | 80.46 | -999 | 1.6 | -0.1 | -1.6 | -999 | 0.1 | 8 | 0.40 | 1 | eor |
2013-12-08 | 403 | 3 | 3.6 | 7.5 | 8.0 | 1010.05 | 93.79 | -999 | 7.3 | 0.0 | -0.7 | -999 | 5.3 | 8 | 0.00 | 2 | eor |
562 rows × 17 columns
"Chengdu-pandas-d10". Licensed under CC BY-SA 2.5 es via Wikimedia Commons.
df1
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1950-01-01 | 403 | 5 | -3.2 | 4.0 | 5.0 | 1025.60 | 83.00 | -999 | -1.1 | -4.9 | -6.3 | -999 | 2.2 | 7 | -999.000 | 0 | eor |
1950-01-02 | 403 | 5 | 1.0 | 6.1 | 8.0 | 1005.60 | 95.00 | -999 | 2.2 | -3.7 | -5.3 | -999 | 12.6 | 8 | -999.000 | 0 | eor |
1950-01-03 | 403 | 5 | 2.8 | 6.5 | 5.0 | 996.60 | 86.00 | -999 | 3.9 | 1.7 | -1.4 | -999 | 0.5 | 1 | -999.000 | 0 | eor |
1950-01-04 | 403 | 5 | -0.1 | 5.2 | 7.7 | 999.50 | 85.00 | -999 | 2.1 | -0.9 | -2.3 | -999 | 0.5 | 7 | -999.000 | 0 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 3 | -5.1 | 3.8 | 4.6 | 1012.22 | 90.21 | -999 | -2.3 | -7.6 | -10.1 | -999 | 3.6 | 7 | 1.333 | 0 | eor |
2014-12-29 | 403 | 3 | -3.6 | 4.5 | 5.6 | 1018.05 | 95.92 | -999 | -1.9 | -6.0 | -13.8 | -999 | 0.0 | 7 | 6.117 | 11 | eor |
2014-12-30 | 403 | 3 | -1.8 | 5.2 | 7.9 | 1020.31 | 97.42 | -999 | 1.1 | -3.5 | -4.6 | -999 | 0.6 | 8 | 0.000 | 8 | eor |
2014-12-31 | 403 | 3 | 2.8 | 7.4 | 8.0 | 1019.90 | 99.00 | -999 | 4.9 | 1.1 | 0.0 | -999 | 0.9 | 6 | 0.000 | 6 | eor |
23741 rows × 17 columns
df2
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | |||||||||||||||||
1949-05-01 | 1443 | 5 | 8.5 | 8.1 | 8.0 | 986.40 | 76.00 | -999.0 | 14.7 | 4.8 | 4.0 | -999.0 | 0.0 | 0 | 0.0 | 0 | eor |
1949-05-02 | 1443 | 5 | 13.2 | 9.9 | 7.7 | 984.30 | 66.00 | -999.0 | 15.2 | 9.5 | 9.0 | -999.0 | 0.0 | 0 | 0.5 | 0 | eor |
1949-05-03 | 1443 | 5 | 16.7 | 13.2 | 3.0 | 980.40 | 72.00 | -999.0 | 22.0 | 9.3 | 6.9 | -999.0 | 0.0 | 0 | 12.6 | 0 | eor |
1949-05-04 | 1443 | 5 | 17.3 | 14.1 | 6.7 | 974.20 | 72.00 | -999.0 | 22.6 | 10.6 | 9.0 | -999.0 | 0.3 | 1 | 8.8 | 0 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 1443 | 3 | -2.4 | 4.6 | 6.3 | 987.32 | 87.50 | 3.8 | 1.5 | -9.6 | -12.1 | 10.2 | 0.2 | 4 | 0.0 | 0 | eor |
2014-12-29 | 1443 | 3 | -6.0 | 3.6 | 5.6 | 1005.85 | 89.30 | 1.4 | -2.7 | -11.7 | -13.6 | 5.3 | 3.6 | 4 | 0.0 | -999 | eor |
2014-12-30 | 1443 | 3 | -1.7 | 5.1 | 7.9 | 1008.12 | 95.13 | 1.1 | -0.1 | -3.5 | -3.6 | 4.4 | 1.2 | 4 | 0.0 | 5 | eor |
2014-12-31 | 1443 | 3 | -0.7 | 5.6 | 7.8 | 1007.05 | 96.33 | 0.9 | 1.0 | -4.8 | -5.2 | 2.9 | 0.0 | 4 | 0.0 | 6 | eor |
23986 rows × 17 columns
df1.index
<class 'pandas.tseries.index.DatetimeIndex'> [1950-01-01, ..., 2014-12-31] Length: 23741, Freq: None, Timezone: None
Be careful: some pandas functionality expects a sorted DatetimeIndex without enforcing it!
Concatenate
(see the excellent documentation at: http://pandas.pydata.org/pandas-docs/stable/merging.html)
df = pd.concat([df1, df2], keys=['Berlin', 'Freiburg'], axis=1)
df
Berlin | ... | Freiburg | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | ... | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
MESS_DATUM | |||||||||||||||||||||
1949-05-01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 14.7 | 4.8 | 4.0 | -999.0 | 0.0 | 0 | 0.0 | 0 | eor |
1949-05-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 15.2 | 9.5 | 9.0 | -999.0 | 0.0 | 0 | 0.5 | 0 | eor |
1949-05-03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 22.0 | 9.3 | 6.9 | -999.0 | 0.0 | 0 | 12.6 | 0 | eor |
1949-05-04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 22.6 | 10.6 | 9.0 | -999.0 | 0.3 | 1 | 8.8 | 0 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 3 | -5.1 | 3.8 | 4.6 | 1012.22 | 90.21 | -999 | -2.3 | -7.6 | ... | 3.8 | 1.5 | -9.6 | -12.1 | 10.2 | 0.2 | 4 | 0.0 | 0 | eor |
2014-12-29 | 403 | 3 | -3.6 | 4.5 | 5.6 | 1018.05 | 95.92 | -999 | -1.9 | -6.0 | ... | 1.4 | -2.7 | -11.7 | -13.6 | 5.3 | 3.6 | 4 | 0.0 | -999 | eor |
2014-12-30 | 403 | 3 | -1.8 | 5.2 | 7.9 | 1020.31 | 97.42 | -999 | 1.1 | -3.5 | ... | 1.1 | -0.1 | -3.5 | -3.6 | 4.4 | 1.2 | 4 | 0.0 | 5 | eor |
2014-12-31 | 403 | 3 | 2.8 | 7.4 | 8.0 | 1019.90 | 99.00 | -999 | 4.9 | 1.1 | ... | 0.9 | 1.0 | -4.8 | -5.2 | 2.9 | 0.0 | 4 | 0.0 | 6 | eor |
23986 rows × 34 columns
Now we got a hierarchichal index http://pandas.pydata.org/pandas-docs/stable/advanced.html#advanced
df = df.swaplevel(0,1, axis=1)
df
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | ... | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Berlin | Berlin | Berlin | Berlin | Berlin | Berlin | Berlin | Berlin | Berlin | Berlin | ... | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | Freiburg | |
MESS_DATUM | |||||||||||||||||||||
1949-05-01 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 14.7 | 4.8 | 4.0 | -999.0 | 0.0 | 0 | 0.0 | 0 | eor |
1949-05-02 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 15.2 | 9.5 | 9.0 | -999.0 | 0.0 | 0 | 0.5 | 0 | eor |
1949-05-03 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 22.0 | 9.3 | 6.9 | -999.0 | 0.0 | 0 | 12.6 | 0 | eor |
1949-05-04 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | -999.0 | 22.6 | 10.6 | 9.0 | -999.0 | 0.3 | 1 | 8.8 | 0 | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 3 | -5.1 | 3.8 | 4.6 | 1012.22 | 90.21 | -999 | -2.3 | -7.6 | ... | 3.8 | 1.5 | -9.6 | -12.1 | 10.2 | 0.2 | 4 | 0.0 | 0 | eor |
2014-12-29 | 403 | 3 | -3.6 | 4.5 | 5.6 | 1018.05 | 95.92 | -999 | -1.9 | -6.0 | ... | 1.4 | -2.7 | -11.7 | -13.6 | 5.3 | 3.6 | 4 | 0.0 | -999 | eor |
2014-12-30 | 403 | 3 | -1.8 | 5.2 | 7.9 | 1020.31 | 97.42 | -999 | 1.1 | -3.5 | ... | 1.1 | -0.1 | -3.5 | -3.6 | 4.4 | 1.2 | 4 | 0.0 | 5 | eor |
2014-12-31 | 403 | 3 | 2.8 | 7.4 | 8.0 | 1019.90 | 99.00 | -999 | 4.9 | 1.1 | ... | 0.9 | 1.0 | -4.8 | -5.2 | 2.9 | 0.0 | 4 | 0.0 | 6 | eor |
23986 rows × 34 columns
df = pd.concat([df1, df2], keys=['Berlin', 'Freiburg'])
df
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | REL_FEUCHTE | WINDGESCHWINDIGKEIT | LUFTTEMPERATUR_MAXIMUM | LUFTTEMPERATUR_MINIMUM | LUFTTEMP_AM_ERDB_MINIMUM | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MESS_DATUM | ||||||||||||||||||
Berlin | 1950-01-01 | 403 | 5 | -3.2 | 4.0 | 5.0 | 1025.60 | 83.00 | -999.0 | -1.1 | -4.9 | -6.3 | -999.0 | 2.2 | 7 | -999 | 0 | eor |
1950-01-02 | 403 | 5 | 1.0 | 6.1 | 8.0 | 1005.60 | 95.00 | -999.0 | 2.2 | -3.7 | -5.3 | -999.0 | 12.6 | 8 | -999 | 0 | eor | |
1950-01-03 | 403 | 5 | 2.8 | 6.5 | 5.0 | 996.60 | 86.00 | -999.0 | 3.9 | 1.7 | -1.4 | -999.0 | 0.5 | 1 | -999 | 0 | eor | |
1950-01-04 | 403 | 5 | -0.1 | 5.2 | 7.7 | 999.50 | 85.00 | -999.0 | 2.1 | -0.9 | -2.3 | -999.0 | 0.5 | 7 | -999 | 0 | eor | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Freiburg | 2014-12-28 | 1443 | 3 | -2.4 | 4.6 | 6.3 | 987.32 | 87.50 | 3.8 | 1.5 | -9.6 | -12.1 | 10.2 | 0.2 | 4 | 0 | 0 | eor |
2014-12-29 | 1443 | 3 | -6.0 | 3.6 | 5.6 | 1005.85 | 89.30 | 1.4 | -2.7 | -11.7 | -13.6 | 5.3 | 3.6 | 4 | 0 | -999 | eor | |
2014-12-30 | 1443 | 3 | -1.7 | 5.1 | 7.9 | 1008.12 | 95.13 | 1.1 | -0.1 | -3.5 | -3.6 | 4.4 | 1.2 | 4 | 0 | 5 | eor | |
2014-12-31 | 1443 | 3 | -0.7 | 5.6 | 7.8 | 1007.05 | 96.33 | 0.9 | 1.0 | -4.8 | -5.2 | 2.9 | 0.0 | 4 | 0 | 6 | eor |
47727 rows × 17 columns
df = df.unstack(level=0)
df
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | ... | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | ... | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | |
MESS_DATUM | |||||||||||||||||||||
1949-05-01 | NaN | 1443 | NaN | 5 | NaN | 8.5 | NaN | 8.1 | NaN | 8.0 | ... | NaN | 0.0 | NaN | 0 | NaN | 0.0 | NaN | 0 | NaN | eor |
1949-05-02 | NaN | 1443 | NaN | 5 | NaN | 13.2 | NaN | 9.9 | NaN | 7.7 | ... | NaN | 0.0 | NaN | 0 | NaN | 0.5 | NaN | 0 | NaN | eor |
1949-05-03 | NaN | 1443 | NaN | 5 | NaN | 16.7 | NaN | 13.2 | NaN | 3.0 | ... | NaN | 0.0 | NaN | 0 | NaN | 12.6 | NaN | 0 | NaN | eor |
1949-05-04 | NaN | 1443 | NaN | 5 | NaN | 17.3 | NaN | 14.1 | NaN | 6.7 | ... | NaN | 0.3 | NaN | 1 | NaN | 8.8 | NaN | 0 | NaN | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 1443 | 3 | 3 | -5.1 | -2.4 | 3.8 | 4.6 | 4.6 | 6.3 | ... | 3.6 | 0.2 | 7 | 4 | 1.333 | 0.0 | 0 | 0 | eor | eor |
2014-12-29 | 403 | 1443 | 3 | 3 | -3.6 | -6.0 | 4.5 | 3.6 | 5.6 | 5.6 | ... | 0.0 | 3.6 | 7 | 4 | 6.117 | 0.0 | 11 | -999 | eor | eor |
2014-12-30 | 403 | 1443 | 3 | 3 | -1.8 | -1.7 | 5.2 | 5.1 | 7.9 | 7.9 | ... | 0.6 | 1.2 | 8 | 4 | 0.000 | 0.0 | 8 | 5 | eor | eor |
2014-12-31 | 403 | 1443 | 3 | 3 | 2.8 | -0.7 | 7.4 | 5.6 | 8.0 | 7.8 | ... | 0.9 | 0.0 | 6 | 4 | 0.000 | 0.0 | 6 | 6 | eor | eor |
23986 rows × 34 columns
But data for Freiburg starts mid 49, we want only full years, so we just "say":
df = df.truncate(before="1950-01-01")
df
STATIONS_ID | QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | ... | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | eor | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | ... | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | |
MESS_DATUM | |||||||||||||||||||||
1950-01-01 | 403 | 1443 | 5 | 5 | -3.2 | -2.9 | 4.0 | 4.3 | 5.0 | 1.3 | ... | 2.2 | 0.0 | 7 | 0 | -999.000 | 4.6 | 0 | 0 | eor | eor |
1950-01-02 | 403 | 1443 | 5 | 5 | 1.0 | 2.3 | 6.1 | 3.7 | 8.0 | 5.7 | ... | 12.6 | 6.2 | 8 | 8 | -999.000 | 1.0 | 0 | 0 | eor | eor |
1950-01-03 | 403 | 1443 | 5 | 5 | 2.8 | 2.4 | 6.5 | 6.5 | 5.0 | 8.0 | ... | 0.5 | 6.2 | 1 | 8 | -999.000 | 0.0 | 0 | 0 | eor | eor |
1950-01-04 | 403 | 1443 | 5 | 5 | -0.1 | 2.4 | 5.2 | 6.8 | 7.7 | 8.0 | ... | 0.5 | 9.0 | 7 | 8 | -999.000 | 0.0 | 0 | 0 | eor | eor |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2014-12-28 | 403 | 1443 | 3 | 3 | -5.1 | -2.4 | 3.8 | 4.6 | 4.6 | 6.3 | ... | 3.6 | 0.2 | 7 | 4 | 1.333 | 0.0 | 0 | 0 | eor | eor |
2014-12-29 | 403 | 1443 | 3 | 3 | -3.6 | -6.0 | 4.5 | 3.6 | 5.6 | 5.6 | ... | 0.0 | 3.6 | 7 | 4 | 6.117 | 0.0 | 11 | -999 | eor | eor |
2014-12-30 | 403 | 1443 | 3 | 3 | -1.8 | -1.7 | 5.2 | 5.1 | 7.9 | 7.9 | ... | 0.6 | 1.2 | 8 | 4 | 0.000 | 0.0 | 8 | 5 | eor | eor |
2014-12-31 | 403 | 1443 | 3 | 3 | 2.8 | -0.7 | 7.4 | 5.6 | 8.0 | 7.8 | ... | 0.9 | 0.0 | 6 | 4 | 0.000 | 0.0 | 6 | 6 | eor | eor |
23741 rows × 34 columns
df.describe()
QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | ... | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | ... | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | |
count | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | ... | 23741 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 | 23741.000000 |
mean | 7.189756 | 6.287014 | 9.228912 | 10.897077 | 9.489082 | 9.409667 | 5.205307 | -109.661084 | 1007.985151 | 982.044536 | ... | -999 | -68.607527 | 1.623912 | 2.378960 | 3.093678 | 2.327071 | -1.455907 | 4.585664 | 0.817741 | -127.160777 |
std | 2.643762 | 2.726823 | 7.641879 | 11.992317 | 7.712391 | 25.693188 | 2.255881 | 319.922454 | 9.136360 | 45.276124 | ... | 0 | 270.671541 | 3.851226 | 14.065206 | 3.103872 | 13.290091 | 78.322974 | 16.561068 | 3.437469 | 333.418686 |
min | 3.000000 | 3.000000 | -17.900000 | -999.000000 | -999.000000 | -999.000000 | 0.000000 | -999.000000 | 961.000000 | -999.000000 | ... | -999 | -999.000000 | 0.000000 | -999.000000 | 0.000000 | -999.000000 | -999.000000 | -999.000000 | 0.000000 | -999.000000 |
25% | 5.000000 | 5.000000 | 3.400000 | 5.300000 | 6.300000 | 6.700000 | 3.700000 | 2.400000 | 1002.400000 | 978.200000 | ... | -999 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.300000 | 0.400000 | 0.000000 | 0.000000 |
50% | 5.000000 | 5.000000 | 9.400000 | 11.300000 | 8.900000 | 9.400000 | 5.700000 | 5.700000 | 1008.300000 | 983.400000 | ... | -999 | 8.900000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 3.600000 | 3.900000 | 0.000000 | 0.000000 |
75% | 10.000000 | 10.000000 | 15.300000 | 16.900000 | 12.500000 | 13.100000 | 7.300000 | 7.300000 | 1013.900000 | 988.300000 | ... | -999 | 12.400000 | 1.500000 | 2.700000 | 6.000000 | 6.000000 | 7.900000 | 8.400000 | 0.000000 | 0.000000 |
max | 10.000000 | 10.000000 | 29.500000 | 30.800000 | 23.900000 | 23.100000 | 8.000000 | 8.000000 | 1040.000000 | 1012.700000 | ... | -999 | 36.400000 | 106.000000 | 80.300000 | 8.000000 | 8.000000 | 16.367000 | 15.200000 | 49.000000 | 40.000000 |
8 rows × 30 columns
But the -999 value is clearly not valid, we replace it with "NaN"
df.replace(-999, np.nan, inplace=True)
df.describe()
QUALITAETS_NIVEAU | LUFTTEMPERATUR | DAMPFDRUCK | BEDECKUNGSGRAD | LUFTDRUCK_STATIONSHOEHE | ... | WINDSPITZE_MAXIMUM | NIEDERSCHLAGSHOEHE | NIEDERSCHLAGSHOEHE_IND | SONNENSCHEINDAUER | SCHNEEHOEHE | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | ... | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | Berlin | Freiburg | |
count | 23741.000000 | 23741.000000 | 23741.000000 | 23739.000000 | 23740.000000 | 23726.000000 | 23741.000000 | 21021.000000 | 23741.000000 | 23729.000000 | ... | 0 | 21889.000000 | 23741.000000 | 23737.000000 | 23741.000000 | 23737.000000 | 23596.000000 | 23735.000000 | 23741.000000 | 20712.000000 |
mean | 7.189756 | 6.287014 | 9.228912 | 10.982160 | 9.531563 | 10.047201 | 5.205307 | 5.414405 | 1007.985151 | 983.046371 | ... | NaN | 10.111778 | 1.623912 | 2.547706 | 3.093678 | 2.495808 | 4.674110 | 4.839362 | 0.817741 | 0.340238 |
std | 2.643762 | 2.726823 | 7.641879 | 7.608645 | 4.079052 | 4.150623 | 2.255881 | 2.365689 | 9.136360 | 8.074512 | ... | NaN | 4.674865 | 3.851226 | 5.371165 | 3.103872 | 2.767641 | 4.402997 | 4.433495 | 3.437469 | 1.839556 |
min | 3.000000 | 3.000000 | -17.900000 | -15.400000 | 1.100000 | 1.200000 | 0.000000 | 0.000000 | 961.000000 | 939.200000 | ... | NaN | 0.300000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 5.000000 | 5.000000 | 3.400000 | 5.300000 | 6.300000 | 6.700000 | 3.700000 | 3.700000 | 1002.400000 | 978.200000 | ... | NaN | 6.700000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.400000 | 0.400000 | 0.000000 | 0.000000 |
50% | 5.000000 | 5.000000 | 9.400000 | 11.300000 | 8.900000 | 9.400000 | 5.700000 | 6.000000 | 1008.300000 | 983.400000 | ... | NaN | 9.300000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 3.700000 | 3.900000 | 0.000000 | 0.000000 |
75% | 10.000000 | 10.000000 | 15.300000 | 16.900000 | 12.500000 | 13.100000 | 7.300000 | 7.300000 | 1013.900000 | 988.300000 | ... | NaN | 12.800000 | 1.500000 | 2.700000 | 6.000000 | 6.000000 | 7.900000 | 8.400000 | 0.000000 | 0.000000 |
max | 10.000000 | 10.000000 | 29.500000 | 30.800000 | 23.900000 | 23.100000 | 8.000000 | 8.000000 | 1040.000000 | 1012.700000 | ... | NaN | 36.400000 | 106.000000 | 80.300000 | 8.000000 | 8.000000 | 16.367000 | 15.200000 | 49.000000 | 40.000000 |
8 rows × 30 columns
From now on we consider only "Lufttemperatur"
airtemp = df.LUFTTEMPERATUR
airtemp.describe()
Berlin | Freiburg | |
---|---|---|
count | 23741.000000 | 23739.000000 |
mean | 9.228912 | 10.982160 |
std | 7.641879 | 7.608645 |
min | -17.900000 | -15.400000 |
25% | 3.400000 | 5.300000 |
50% | 9.400000 | 11.300000 |
75% | 15.300000 | 16.900000 |
max | 29.500000 | 30.800000 |
airtemp.plot(fontsize=16)
<matplotlib.axes._subplots.AxesSubplot at 0x7effcd806bd0>
ax = airtemp.boxplot(return_type='axes', fontsize=20)
dfyear=airtemp.groupby(airtemp.index.year).mean()
dfyear
Berlin | Freiburg | |
---|---|---|
1950 | 9.330685 | 10.961370 |
1951 | 9.680274 | 10.835342 |
1952 | 8.423770 | 10.627322 |
1953 | 10.038082 | 10.667671 |
... | ... | ... |
2011 | 10.043288 | 11.530137 |
2012 | 9.545902 | 10.943443 |
2013 | 9.395890 | 10.352603 |
2014 | 11.044384 | 11.874795 |
65 rows × 2 columns
dfyear.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effccd07ad0>
pd.rolling_mean(dfyear, 10).plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effccf1ed50>
snow_days.groupby(snow_days.index.year)['SCHNEEHOEHE'].count().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effccf13090>
snow_days.groupby(snow_days.index.year)['SCHNEEHOEHE'].count().to_frame()[-7:]
SCHNEEHOEHE | |
---|---|
2008 | 14 |
2009 | 52 |
2010 | 93 |
2011 | 19 |
2012 | 36 |
2013 | 69 |
2014 | 19 |
airtemp.corr()
Berlin | Freiburg | |
---|---|---|
Berlin | 1.00000 | 0.91505 |
Freiburg | 0.91505 | 1.00000 |
import statsmodels.formula.api as smf
import statsmodels.api as sm
dfyear['year'] = dfyear.index
dfyear
Berlin | Freiburg | year | |
---|---|---|---|
1950 | 9.330685 | 10.961370 | 1950 |
1951 | 9.680274 | 10.835342 | 1951 |
1952 | 8.423770 | 10.627322 | 1952 |
1953 | 10.038082 | 10.667671 | 1953 |
... | ... | ... | ... |
2011 | 10.043288 | 11.530137 | 2011 |
2012 | 9.545902 | 10.943443 | 2012 |
2013 | 9.395890 | 10.352603 | 2013 |
2014 | 11.044384 | 11.874795 | 2014 |
65 rows × 3 columns
berlin_model = smf.ols(formula="Berlin ~ year", data=dfyear).fit()
freiburg_model = smf.ols(formula="Freiburg ~ year", data=dfyear).fit()
berlin_model.summary()
Dep. Variable: | Berlin | R-squared: | 0.233 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.221 |
Method: | Least Squares | F-statistic: | 19.18 |
Date: | Thu, 13 Aug 2015 | Prob (F-statistic): | 4.59e-05 |
Time: | 15:28:39 | Log-Likelihood: | -70.925 |
No. Observations: | 65 | AIC: | 145.8 |
Df Residuals: | 63 | BIC: | 150.2 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -32.7638 | 9.590 | -3.416 | 0.001 | -51.928 -13.600 |
year | 0.0212 | 0.005 | 4.379 | 0.000 | 0.012 0.031 |
Omnibus: | 1.993 | Durbin-Watson: | 1.610 |
---|---|---|---|
Prob(Omnibus): | 0.369 | Jarque-Bera (JB): | 1.916 |
Skew: | -0.342 | Prob(JB): | 0.384 |
Kurtosis: | 2.510 | Cond. No. | 2.09e+05 |
freiburg_model.summary()
Dep. Variable: | Freiburg | R-squared: | 0.278 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.267 |
Method: | Least Squares | F-statistic: | 24.29 |
Date: | Thu, 13 Aug 2015 | Prob (F-statistic): | 6.32e-06 |
Time: | 15:28:39 | Log-Likelihood: | -68.552 |
No. Observations: | 65 | AIC: | 141.1 |
Df Residuals: | 63 | BIC: | 145.5 |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -34.5893 | 9.246 | -3.741 | 0.000 | -53.067 -16.112 |
year | 0.0230 | 0.005 | 4.929 | 0.000 | 0.014 0.032 |
Omnibus: | 1.153 | Durbin-Watson: | 1.289 |
---|---|---|---|
Prob(Omnibus): | 0.562 | Jarque-Bera (JB): | 1.149 |
Skew: | -0.301 | Prob(JB): | 0.563 |
Kurtosis: | 2.753 | Cond. No. | 2.09e+05 |
import seaborn as sns
sns.set(style="ticks")
# Load the example dataset for Anscombe's quartet
df = sns.load_dataset("anscombe")
# Show the results of a linear regression within each dataset
sns.lmplot(x="year", y="Berlin", data=dfyear,
size=8,
scatter_kws={"s": 50, "alpha": 1})
plt.plot(dfyear.index, dfyear.Berlin)
[<matplotlib.lines.Line2D at 0x7effc08bdbd0>]
sns.lmplot(x="year", y="Freiburg", data=dfyear,
size=8,
scatter_kws={"s": 50, "alpha": 1})
plt.plot(dfyear.index, dfyear.Freiburg)
[<matplotlib.lines.Line2D at 0x7effbec9ee10>]
"Xiao Liwu im San Diego Zoo - Foto 3" by User:jballeis - Own work. Licensed under CC BY-SA 3.0 via Wikimedia Commons.
df.columns
Index([u'dataset', u'x', u'y'], dtype='object')
gg=airtemp.groupby([airtemp.index.month, airtemp.index.year]).mean()
gg
Berlin | Freiburg | ||
---|---|---|---|
1 | 1950 | -1.383871 | 0.948387 |
1951 | 1.096774 | 4.222581 | |
1952 | 1.187097 | 1.590323 | |
1953 | 0.667742 | -1.019355 | |
... | ... | ... | ... |
12 | 2011 | 4.370968 | 5.990323 |
2012 | 0.554839 | 4.183871 | |
2013 | 4.248387 | 3.154839 | |
2014 | 2.751613 | 4.841935 |
780 rows × 2 columns
gg.loc[1].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effbe4d1690>
gg.loc[10].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effbcc21290>
gg=airtemp.groupby([airtemp.index.month]).mean()
gg.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effcd289350>
gg=airtemp.groupby([airtemp.index.dayofyear]).mean()
gg.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7effbcaa4dd0>