7. Group Assignment & Presentation¶

You should be able to start on this exercise after Lecture 1.

This exercise must be a group effort. That means everyone must participate in the assignment.

In this assignment you will solve a data science problem end-to-end, pretending to be recently hired data scientists in a company. To help you get started, we've prepared a checklist to guide you through the project. Here are the main steps that you will go through:

  1. Frame the problem and look at the big picture
  2. Get the data
  3. Explore and visualise the data to gain insights
  4. Prepare the data to better expose the underlying data patterns to machine learning algorithms
  5. Explore many different models and short-list the best ones
  6. Fine-tune your models
  7. Present your solution

In each step we list a set of questions that one should have in mind when undertaking a data science project. The list is not meant to be exhaustive, but does contain a selection of the most important questions to ask. We will be available to provide assistance with each of the steps, and will allocate some part of each lesson towards working on the projects.

Your group must submit a single Jupyter notebook, structured in terms of the first 6 sections listed above (the seventh will be a video uploaded to some streaming platform, e.g. YouTube, Vimeo, etc.).

1. Analysis: Frame the problem and look at the big picture¶

  1. Find a problem/task that everyone in the group finds interesting
  2. Define the objective in business terms
  3. How should you frame the problem (supervised/unsupervised etc.)?
  4. How should performance be measured?
  1. The problem we will investigate is why some countries are considered happier than others.
  2. Find out whether there is an association between happiness and the features (Economy, Family, Health, Freedom, Trust, Generosity, and Beer, Spirit and Wine consumption).
  3. Supervised learning, because the data has a known answer: how happy each country is. It is a regression problem, since the happiness score is continuous and can take infinitely many values.
  4. By looking at the performance (e.g. the R² score) of various regression algorithms on held-out data.
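
For step 4, a minimal sketch of how a regression model could be scored (hedged example; model, X_test and y_test are placeholders for a fitted estimator and a held-out test set, not objects defined yet in this notebook):

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Hypothetical scoring of a fitted regression model on held-out data.
r2 = r2_score(y_test, model.predict(X_test))                       # share of variance explained
rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))  # error in happiness-score units
print("R^2: {:.3f}, RMSE: {:.3f}".format(r2, rmse))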

2. Get the data¶

  1. Find and document where you can get the data from
  2. Get the data
  3. Check the size and type of data (time series, geographical etc)

We will use two datasets. One contains the happiness score together with features commonly used to explain why a country's happiness is higher, such as economy and health. The other contains per-capita alcohol consumption by country. Both datasets were compiled in 2016. The dataset URLs are: https://www.kaggle.com/datasets/cssouza91/hapiness?select=2016.csv

https://www.kaggle.com/datasets/marcospessotto/happiness-and-alcohol-consumption

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn
from scipy import stats
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
import plotly.graph_objs as go
from plotly.offline import iplot

from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
In [2]:
happiness = pd.read_csv("2016.csv")
happiness.head()
Out[2]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual
0 Denmark Western Europe 1 7.526 7.460 7.592 1.44178 1.16374 0.79504 0.57941 0.44453 0.36171 2.73939
1 Switzerland Western Europe 2 7.509 7.428 7.590 1.52733 1.14524 0.86303 0.58557 0.41203 0.28083 2.69463
2 Iceland Western Europe 3 7.501 7.333 7.669 1.42666 1.18326 0.86733 0.56624 0.14975 0.47678 2.83137
3 Norway Western Europe 4 7.498 7.421 7.575 1.57744 1.12690 0.79579 0.59609 0.35776 0.37895 2.66465
4 Finland Western Europe 5 7.413 7.351 7.475 1.40598 1.13464 0.81091 0.57104 0.41004 0.25492 2.82596
In [3]:
print("It has {} rows and {} columns.".format(happiness.shape[0], happiness.shape[1]))
happiness.dtypes
It has 157 rows and 13 columns.
Out[3]:
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
dtype: object

The alcohol dataset also contains data from 2016, like the happiness dataset. It has a few columns that duplicate information we already have, so we drop them before merging.

In [4]:
alcoholCSV = pd.read_csv("HappinessAlcoholConsumption.csv")
alcoholCSV = alcoholCSV.drop(["Region", "Hemisphere", "HappinessScore", "HDI", "GDP_PerCapita"], axis=1)
alcoholCSV.head()
Out[4]:
Country Beer_PerCapita Spirit_PerCapita Wine_PerCapita
0 Denmark 224 81 278
1 Switzerland 185 100 280
2 Iceland 233 61 78
3 Norway 169 71 129
4 Finland 263 133 97
In [5]:
print("It has {} rows and {} columns.".format(alcoholCSV.shape[0], alcoholCSV.shape[1]))
alcoholCSV.dtypes
It has 122 rows and 4 columns.
Out[5]:
Country             object
Beer_PerCapita       int64
Spirit_PerCapita     int64
Wine_PerCapita       int64
dtype: object
In [6]:
combinedDatasets = happiness.merge(alcoholCSV, left_on="Country", right_on="Country")
combinedDatasets.head()
Out[6]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
0 Denmark Western Europe 1 7.526 7.460 7.592 1.44178 1.16374 0.79504 0.57941 0.44453 0.36171 2.73939 224 81 278
1 Switzerland Western Europe 2 7.509 7.428 7.590 1.52733 1.14524 0.86303 0.58557 0.41203 0.28083 2.69463 185 100 280
2 Iceland Western Europe 3 7.501 7.333 7.669 1.42666 1.18326 0.86733 0.56624 0.14975 0.47678 2.83137 233 61 78
3 Norway Western Europe 4 7.498 7.421 7.575 1.57744 1.12690 0.79579 0.59609 0.35776 0.37895 2.66465 169 71 129
4 Finland Western Europe 5 7.413 7.351 7.475 1.40598 1.13464 0.81091 0.57104 0.41004 0.25492 2.82596 263 133 97
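
Note that merge performs an inner join by default, so only countries present in both datasets are kept; this is why the merged data has fewer rows (118) than either input (157 and 122). A hedged way to check which countries are lost, using pandas' indicator option:

# Optional check: an outer join with indicator=True flags rows that exist in only
# one of the two datasets, i.e. the countries dropped by the inner join above.
check = happiness.merge(alcoholCSV, on="Country", how="outer", indicator=True)
print(check["_merge"].value_counts())
print(sorted(check.loc[check["_merge"] != "both", "Country"]))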

3. Explore the data¶

  1. Create a copy of the data for explorations (sampling it down to a manageable size if necessary)
  2. Create a Jupyter notebook to keep a record of your data exploration
  3. Study each feature and its characteristics:
    • Name
    • Type (categorical, int/float, bounded/unbounded, text, structured, etc)
    • Percentage of missing values
    • Check for outliers, rounding errors etc
  4. For supervised learning tasks, identify the target(s)
  5. Visualise the data
  6. Study the correlations between features
  7. Identify the promising transformations you may want to apply (e.g. convert skewed targets to normal via a log transformation)
  8. Document what you have learned

1 Creating copy¶

In [9]:
dataset = combinedDatasets.copy()
In [10]:
print("It has {} rows and {} columns.".format(dataset.shape[0], dataset.shape[1]))
dataset.dtypes
It has 118 rows and 16 columns.
Out[10]:
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
Beer_PerCapita                     int64
Spirit_PerCapita                   int64
Wine_PerCapita                     int64
dtype: object

2 Jupyter Notebook¶

3 Features description¶


  1. Country: Name of the country
  2. Region: The region the country belongs to
  3. Happiness Rank: Rank of the country based on the Happiness Score
  4. Happiness Score: A metric measured in 2016 by asking the sampled people the question: "How would you rate your happiness on a scale of 0 to 10 where 10 is the happiest"
  5. Lower Confidence Interval: lower bound of the confidence interval around the Happiness Score
  6. Upper Confidence Interval: upper bound of the confidence interval around the Happiness Score
  7. Economy (GDP per Capita): Country's GDP per capita
  8. Family: Strong relationship with a partner, family, friends
  9. Health (Life Expectancy): Longer and healthier life
  10. Freedom: Freedom to make life choices
  11. Trust (Government Corruption): Perceived trustworthiness of government (absence of corruption)
  12. Generosity: How generous people are
  13. Dystopia Residual: Dystopia is a hypothetical country ranked lower than the lowest-ranking real country. The Dystopia Residual is the Dystopia happiness score plus each country's own prediction error, which measures the extent to which life evaluations are higher or lower than predicted
  14. Beer_PerCapita: beer consumption per capita in litres
  15. Spirit_PerCapita: spirit consumption per capita in litres
  16. Wine_PerCapita: wine consumption per capita in litres

Feature descriptions are based on https://worldhappiness.report/ed/2022/happiness-benevolence-and-trust-during-covid-19-and-beyond/

In [11]:
dataset.isnull().sum()
Out[11]:
Country                          0
Region                           0
Happiness Rank                   0
Happiness Score                  0
Lower Confidence Interval        0
Upper Confidence Interval        0
Economy (GDP per Capita)         0
Family                           0
Health (Life Expectancy)         0
Freedom                          0
Trust (Government Corruption)    0
Generosity                       0
Dystopia Residual                0
Beer_PerCapita                   0
Spirit_PerCapita                 0
Wine_PerCapita                   0
dtype: int64
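
The checklist asks for the percentage of missing values; the counts above are all zero, and the same can be expressed as percentages in one line:

# Percentage of missing values per column (all 0.0 for this dataset).
(dataset.isnull().mean() * 100).round(2)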
In [12]:
dataset.dtypes
Out[12]:
Country                           object
Region                            object
Happiness Rank                     int64
Happiness Score                  float64
Lower Confidence Interval        float64
Upper Confidence Interval        float64
Economy (GDP per Capita)         float64
Family                           float64
Health (Life Expectancy)         float64
Freedom                          float64
Trust (Government Corruption)    float64
Generosity                       float64
Dystopia Residual                float64
Beer_PerCapita                     int64
Spirit_PerCapita                   int64
Wine_PerCapita                     int64
dtype: object
In [13]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 0 to 117
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        118 non-null    object 
 1   Region                         118 non-null    object 
 2   Happiness Rank                 118 non-null    int64  
 3   Happiness Score                118 non-null    float64
 4   Lower Confidence Interval      118 non-null    float64
 5   Upper Confidence Interval      118 non-null    float64
 6   Economy (GDP per Capita)       118 non-null    float64
 7   Family                         118 non-null    float64
 8   Health (Life Expectancy)       118 non-null    float64
 9   Freedom                        118 non-null    float64
 10  Trust (Government Corruption)  118 non-null    float64
 11  Generosity                     118 non-null    float64
 12  Dystopia Residual              118 non-null    float64
 13  Beer_PerCapita                 118 non-null    int64  
 14  Spirit_PerCapita               118 non-null    int64  
 15  Wine_PerCapita                 118 non-null    int64  
dtypes: float64(10), int64(4), object(2)
memory usage: 15.7+ KB
In [14]:
dataset.iloc[:, 3:].describe()
Out[14]:
Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
count 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000 118.000000
mean 5.557195 5.459975 5.654415 1.021216 0.840081 0.594344 0.383215 0.134977 0.229399 2.353952 138.906780 97.067797 68.093220
std 1.145853 1.153111 1.139521 0.381773 0.251482 0.222809 0.145206 0.114400 0.125321 0.519255 105.090599 78.595658 89.066189
min 3.069000 2.936000 3.202000 0.087090 0.000000 0.000000 0.005890 0.000000 0.000000 0.817890 1.000000 1.000000 1.000000
25% 4.640000 4.515250 4.767250 0.817762 0.708878 0.478178 0.275750 0.052440 0.140220 2.090750 42.000000 32.000000 5.000000
50% 5.553000 5.469500 5.677500 1.088420 0.877970 0.643315 0.404725 0.098535 0.211335 2.300940 127.500000 85.500000 16.000000
75% 6.480250 6.396750 6.570750 1.308817 1.041250 0.775190 0.502933 0.177445 0.303113 2.694330 224.750000 142.500000 115.250000
max 7.526000 7.460000 7.669000 1.824270 1.183260 0.947190 0.608480 0.480490 0.586960 3.559060 376.000000 373.000000 370.000000
In [15]:
dataset.iloc[:, :3].plot.box()
Out[15]:
<AxesSubplot:>
In [16]:
dataset.iloc[:, 3:13].plot.box(figsize=(20,12))
Out[16]:
<AxesSubplot:>

Family, Trust, Generosity and Dystopia Residual have outliers. We do not care about Dystopia Residual, as it will not be used by our learning algorithms, but below we decide whether to drop the outliers in the other three columns.

In [17]:
def outliers(column):
    '''
    Return outliers for a given column name
    '''
    Q1 = dataset[column].quantile(0.25)
    Q3 = dataset[column].quantile(0.75)
    IQR = Q3 - Q1    #IQR is interquartile range. 
    return dataset[(dataset[column] < Q1 - 1.5 * IQR) | (dataset[column] > Q3 + 1.5 *IQR)]

outliers("Family")
Out[17]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
97 Georgia Central and Eastern Europe 126 4.252 4.164 4.340 0.83792 0.19249 0.64035 0.32461 0.31880 0.06786 1.87031 52 100 149
101 Malawi Sub-Saharan Africa 132 4.156 4.041 4.271 0.08709 0.14700 0.29364 0.41430 0.07564 0.30968 2.82859 8 11 1
115 Benin Sub-Saharan Africa 153 3.484 3.404 3.564 0.39499 0.10419 0.21028 0.39747 0.06681 0.20180 2.10812 34 4 13
116 Togo Sub-Saharan Africa 155 3.303 3.192 3.414 0.28123 0.00000 0.24811 0.34678 0.11587 0.17517 2.13540 36 2 19
117 Syria Middle East and Northern Africa 156 3.069 2.936 3.202 0.74719 0.14866 0.62994 0.06912 0.17233 0.48397 0.81789 5 35 16
In [18]:
outliers("Trust (Government Corruption)")
Out[18]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
0 Denmark Western Europe 1 7.526 7.460 7.592 1.44178 1.16374 0.79504 0.57941 0.44453 0.36171 2.73939 224 81 278
1 Switzerland Western Europe 2 7.509 7.428 7.590 1.52733 1.14524 0.86303 0.58557 0.41203 0.28083 2.69463 185 100 280
4 Finland Western Europe 5 7.413 7.351 7.475 1.40598 1.13464 0.81091 0.57104 0.41004 0.25492 2.82596 263 133 97
7 New Zealand Australia and New Zealand 8 7.334 7.264 7.404 1.36066 1.17278 0.83096 0.58147 0.41904 0.49401 2.47553 203 79 175
9 Sweden Western Europe 10 7.291 7.227 7.355 1.45181 1.08764 0.83121 0.58218 0.40867 0.38254 2.54734 152 60 186
20 Singapore Southeastern Asia 22 6.739 6.674 6.804 1.64555 0.86758 0.94719 0.48770 0.46987 0.32706 1.99375 60 12 11
32 Qatar Middle East and Northern Africa 36 6.375 6.178 6.572 1.82427 0.87964 0.71723 0.56679 0.48049 0.32388 1.58224 1 42 7
In [19]:
outliers("Generosity")
Out[19]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
28 Malta Western Europe 30 6.488 6.409 6.567 1.30782 1.09879 0.80315 0.54994 0.17554 0.56237 1.99032 149 100 120
31 Thailand Southeastern Asia 33 6.474 6.396 6.552 1.08930 1.04477 0.64915 0.49553 0.02833 0.58696 2.57960 99 258 1

We can plot the data to see how similar the countries' alcohol consumption is and look for outliers. There are a few outliers, especially for wine consumption, where half of the values are very low, indicating that many countries drink little to no wine.

In [20]:
dataset.iloc[:, 13:].plot.box(figsize=(20,12))
Out[20]:
<AxesSubplot:>

The outliers below are for spirit consumption.

In [21]:
outliers('Spirit_PerCapita')
Out[21]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
53 Belarus Central and Eastern Europe 61 5.802 5.723 5.881 1.13062 1.04993 0.63104 0.29091 0.17457 0.13942 2.38582 142 373 42
104 Haiti Latin America and Caribbean 136 4.028 3.893 4.163 0.34097 0.29561 0.27494 0.12072 0.14476 0.47958 2.37116 1 326 1

The outliers below are for wine. We are not necessarily interested in removing them, because the wine data is so unevenly distributed.

In [22]:
outliers('Wine_PerCapita')
Out[22]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
30 France Western Europe 32 6.478 6.397 6.559 1.39488 1.00508 0.83795 0.46562 0.17808 0.12160 2.47440 127 151 370
78 Portugal Western Europe 94 5.123 5.030 5.216 1.27607 0.94367 0.79363 0.44727 0.01521 0.11691 1.53015 194 67 339

Below we list the countries with the lowest wine consumption, less than 3 litres per capita.

In [23]:
dataset[(dataset['Wine_PerCapita'] < 3)]
Out[23]:
Country Region Happiness Rank Happiness Score Lower Confidence Interval Upper Confidence Interval Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Dystopia Residual Beer_PerCapita Spirit_PerCapita Wine_PerCapita
31 Thailand Southeastern Asia 33 6.474 6.396 6.552 1.08930 1.04477 0.64915 0.49553 0.02833 0.58696 2.57960 99 258 1
34 Guatemala Latin America and Caribbean 39 6.324 6.213 6.435 0.83454 0.87119 0.54039 0.50379 0.08701 0.28808 3.19863 53 69 2
40 El Salvador Latin America and Caribbean 46 6.068 5.967 6.169 0.87370 0.80975 0.59600 0.37269 0.10613 0.08877 3.22134 52 69 2
41 Nicaragua Latin America and Caribbean 48 5.992 5.877 6.107 0.69384 0.89521 0.65213 0.46582 0.16292 0.29773 2.82428 78 118 1
66 Jordan Middle East and Northern Africa 80 5.303 5.187 5.419 0.99673 0.86216 0.60712 0.36023 0.13297 0.14262 2.20142 6 21 1
68 Philippines Southeastern Asia 82 5.279 5.160 5.398 0.81217 0.87877 0.47036 0.54854 0.11757 0.21674 2.23484 71 186 1
80 Vietnam Southeastern Asia 96 5.061 4.991 5.131 0.74037 0.79117 0.66157 0.55954 0.11556 0.25075 1.94180 111 2 1
84 Nigeria Sub-Saharan Africa 103 4.875 4.750 5.000 0.75216 0.64498 0.05108 0.27854 0.03050 0.23219 2.88586 42 5 2
85 Honduras Latin America and Caribbean 104 4.871 4.750 4.992 0.69429 0.75596 0.58383 0.26755 0.06906 0.20440 2.29551 69 98 2
88 Sierra Leone Sub-Saharan Africa 111 4.635 4.505 4.765 0.36485 0.62800 0.00000 0.30685 0.08196 0.23897 3.01402 25 3 2
89 Namibia Sub-Saharan Africa 113 4.574 4.374 4.774 0.93287 0.70362 0.34745 0.48614 0.10398 0.07795 1.92198 376 3 1
92 Egypt Middle East and Northern Africa 120 4.362 4.259 4.465 0.95395 0.49813 0.52116 0.18847 0.10393 0.12706 1.96895 6 4 1
94 Kenya Sub-Saharan Africa 122 4.356 4.259 4.453 0.52267 0.76240 0.30147 0.40576 0.06686 0.41328 1.88326 58 22 2
101 Malawi Sub-Saharan Africa 132 4.156 4.041 4.271 0.08709 0.14700 0.29364 0.41430 0.07564 0.30968 2.82859 8 11 1
103 Mali Sub-Saharan Africa 135 4.073 3.988 4.158 0.31292 0.86333 0.16347 0.27544 0.13647 0.21064 2.11087 5 1 1
104 Haiti Latin America and Caribbean 136 4.028 3.893 4.163 0.34097 0.29561 0.27494 0.12072 0.14476 0.47958 2.37116 1 326 1
106 Comoros Sub-Saharan Africa 138 3.956 3.860 4.052 0.27509 0.60323 0.29981 0.15412 0.18437 0.18270 2.25632 1 3 1
107 Cambodia Southeastern Asia 140 3.907 3.798 4.016 0.55604 0.53750 0.42494 0.58852 0.08092 0.40339 1.31573 57 65 1
109 Niger Sub-Saharan Africa 142 3.856 3.781 3.931 0.13270 0.60530 0.26162 0.38041 0.17176 0.20970 2.09469 3 2 1
110 Chad Sub-Saharan Africa 144 3.763 3.672 3.854 0.42214 0.63178 0.03824 0.12807 0.04952 0.18667 2.30637 15 1 1
113 Tanzania Sub-Saharan Africa 149 3.666 3.561 3.771 0.47155 0.77623 0.35700 0.31760 0.05099 0.31472 1.37769 36 6 1
114 Liberia Sub-Saharan Africa 150 3.622 3.463 3.781 0.10706 0.50353 0.23165 0.25748 0.04852 0.24063 2.23284 19 152 2

4 Identify¶

The target is the Happiness Score. From the happiness dataset we will use the columns Economy, Family, Health, Freedom, Trust and Generosity to see whether there is any association between those values and happiness. From the alcohol dataset we will look for associations with the columns Beer_PerCapita, Spirit_PerCapita and Wine_PerCapita.

5 Visualize¶

In [24]:
dataset.plot(subplots=True, figsize=(8, 8));
In [25]:
#https://levelup.gitconnected.com/plotting-choropleth-maps-in-python-b74c53b8d0a6
def featureWorldMap(feature, title):
    '''
    Returns a world heatmap of a given feature
    '''
    return go.Figure(
        data = {
            'type':'choropleth',
            'locations':dataset['Country'],
            'locationmode':'country names',
            'colorscale':["darkred","red","lightcoral","white", "palegreen","green","darkgreen"],
            'z':dataset[feature],
            'colorbar':{'title': title},
            'marker': {
                'line': {
                    'color':'rgb(255,255,255)',
                    'width':2
                }
            }
        },     
        layout = {      
          'geo':{
              'scope':'world', 
          }  
        }
    )
In [26]:
featureWorldMap("Happiness Score", "Happiness Score")
[World choropleth map of the Happiness Score]

World maps of beer, spirit and wine consumption¶

The maps below are inspired by https://www.kaggle.com/code/lily1917/happiness-alcohol-world-heat-map-eda, but are generated directly from the alcohol data merged above.

In [27]:
featureWorldMap("Beer_PerCapita", "Beer consumption")
[World choropleth map of beer consumption per capita]
In [28]:
featureWorldMap("Spirit_PerCapita", "Spirit  consumption")
[World choropleth map of spirit consumption per capita]
In [29]:
featureWorldMap("Wine_PerCapita", "Wine  consumption")
[World choropleth map of wine consumption per capita]

6 Correlation between features¶

In [30]:
data_corr = dataset.iloc[:,[3,6,7,8,9,10,11,13,14,15]]
# pairplot
seaborn.pairplot(data_corr)
# to show
plt.show()
In [31]:
print("happiness and Economy\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,6])))
print("happiness and Family\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,7])))
print("happiness and Health\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,8])))
print("happiness and Freedom\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,9])))
print("happiness and Trust\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,10])))
print("happiness and Generosity correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,11])))
print("happiness and Beer\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,13])))
print("happiness and Spirit\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,14])))
print("happiness and Wine\t correlation is {}.".format(dataset.iloc[:,3].corr(dataset.iloc[:,15])))
happiness and Economy	 correlation is 0.7896671254192654.
happiness and Family	 correlation is 0.7509180595792792.
happiness and Health	 correlation is 0.7470862678123842.
happiness and Freedom	 correlation is 0.6183532344146925.
happiness and Trust	 correlation is 0.5161251180353214.
happiness and Generosity correlation is 0.3067203229436911.
happiness and Beer	 correlation is 0.4807494310940025.
happiness and Spirit	 correlation is 0.23063681111279222.
happiness and Wine	 correlation is 0.44042535523016324.
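
The same numbers can also be obtained in a single call on the data_corr frame defined above (a hedged equivalent of the prints):

# Correlation of every selected feature with the Happiness Score, sorted.
data_corr.corr()["Happiness Score"].drop("Happiness Score").sort_values(ascending=False)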

We can also see the correlation in a heatmap.

In [32]:
labels = data_corr.columns

seaborn.heatmap(data_corr.corr(),
             mask=np.triu(np.ones_like(data_corr.corr())),
             annot=True,
             xticklabels=labels,
             yticklabels=labels)\
    .set_title('Correlation between happiness and features')

plt.xticks(rotation=45)
Out[32]:
(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5]),
 [Text(0.5, 0, 'Happiness Score'),
  Text(1.5, 0, 'Economy (GDP per Capita)'),
  Text(2.5, 0, 'Family'),
  Text(3.5, 0, 'Health (Life Expectancy)'),
  Text(4.5, 0, 'Freedom'),
  Text(5.5, 0, 'Trust (Government Corruption)'),
  Text(6.5, 0, 'Generosity'),
  Text(7.5, 0, 'Beer_PerCapita'),
  Text(8.5, 0, 'Spirit_PerCapita'),
  Text(9.5, 0, 'Wine_PerCapita')])

7 Identify transformations¶

Happiness data: We will remove the columns that are not part of our research question: Lower Confidence Interval, Upper Confidence Interval, Dystopia Residual and Happiness Rank. On inspection, the outliers in the Trust and Generosity columns look like plausible values, so they will not be dropped. The Family outlier (Togo) is suspicious, as it is the only 0 in that column; however, the other feature columns also contain 0 values that were not flagged as outliers, so we assume it is a real value and keep it.

Alcohol data: The duplicate columns were already removed before merging the datasets. Looking at the outliers for alcohol consumption and comparing them to neighbouring countries on the world maps, they do not look overly suspicious. Only Haiti seems a little out of place, which may be due to high consumption of spirits such as rum, commonly produced in Central and South America.
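
Following the advice in section 4 below to write functions for data transformations, a hedged sketch of how the planned column drops could be wrapped up (the function name is our own choice and is not used elsewhere in the notebook):

# Hypothetical helper: drop the columns identified above that are not part of the
# research question, leaving the original dataset untouched.
def drop_unused_columns(df):
    return df.drop(["Happiness Rank", "Lower Confidence Interval",
                    "Upper Confidence Interval", "Dystopia Residual"], axis=1)

drop_unused_columns(dataset).head()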

8 Document¶

We can see that Economy, Family, Health and Freedom are strongly correlated with a country's happiness. Trust is also correlated, though less strongly. Generosity shows only a weak association with how happy a country is. From the alcohol data, beer and wine show the strongest (though still moderate) correlations with happiness, while spirit consumption correlates very little.

4. Prepare the data¶

Notes:

  • Work on copies of the data (keep the original dataset intact).
  • Write functions for all data transformations you apply, for three reasons:
    • So you can easily prepare the data the next time you run your code
    • So you can apply these transformations in future projects
    • To clean and prepare the test set
  1. Data cleaning:
    • Fix or remove outliers (or keep them)
    • Fill in missing values (e.g. with zero, mean, median, regression ...) or drop their rows (or columns)
  2. Feature selection (optional):
    • Drop the features that provide no useful information for the task (e.g. a customer ID is usually useless for modelling).
  3. Feature engineering, where appropriate:
    • Discretize continuous features
    • Use one-hot encoding if/when relevant
    • Add promising transformations of features (e.g. log(x), √x, x², etc)
    • Aggregate features into promising new features
  4. Feature scaling: standardise or normalise features
In [33]:
df_hap = dataset.iloc[:,[3,6,7,8,9,10,11,13,14,15]]
In [34]:
df_hap.head()
Out[34]:
Happiness Score Economy (GDP per Capita) Family Health (Life Expectancy) Freedom Trust (Government Corruption) Generosity Beer_PerCapita Spirit_PerCapita Wine_PerCapita
0 7.526 1.44178 1.16374 0.79504 0.57941 0.44453 0.36171 224 81 278
1 7.509 1.52733 1.14524 0.86303 0.58557 0.41203 0.28083 185 100 280
2 7.501 1.42666 1.18326 0.86733 0.56624 0.14975 0.47678 233 61 78
3 7.498 1.57744 1.12690 0.79579 0.59609 0.35776 0.37895 169 71 129
4 7.413 1.40598 1.13464 0.81091 0.57104 0.41004 0.25492 263 133 97

Because alcohol consumption is measured in litres, those values range from 1 to 376, while the remaining features lie roughly between 0 and 3, so the data needs to be rescaled. Below we standardise all features with a StandardScaler (zero mean, unit variance), which is what the rest of the notebook uses.
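
If values strictly between 0 and 1 were preferred instead, the MinMaxScaler imported at the top could be swapped in; a hedged sketch, not used in the rest of the notebook:

# Hypothetical alternative: rescale every feature to the [0, 1] range.
minmax = MinMaxScaler()
X_minmax = minmax.fit_transform(df_hap.iloc[:, 1:])
pd.DataFrame(X_minmax, columns=df_hap.columns[1:]).describe().loc[["min", "max"]]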

In [35]:
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)
Out[35]:
0 1 2 3 4 5 6 7 8
0 1.106305 1.292497 0.904595 1.356911 2.717426 1.060275 0.813166 -0.205308 2.366801
1 1.331347 1.218619 1.211045 1.399515 2.432123 0.412142 0.440475 0.037467 2.389352
2 1.066532 1.370448 1.230427 1.265826 0.129687 1.982390 0.899172 -0.460860 0.111704
3 1.463163 1.145381 0.907975 1.472272 1.955712 1.198428 0.287576 -0.333084 0.686753
4 1.012132 1.176289 0.976126 1.299023 2.414654 0.204512 1.185857 0.459128 0.325938
... ... ... ... ... ... ... ... ... ...
113 -1.445914 -0.254981 -1.069776 -0.453799 -0.737281 0.683720 -0.983395 -1.163628 -0.756509
114 -2.404715 -1.343978 -1.634764 -0.869597 -0.758964 0.089999 -1.145850 0.701902 -0.745233
115 -1.647307 -2.938698 -1.731085 0.098592 -0.598405 -0.221166 -1.002508 -1.189184 -0.621203
116 -1.946556 -3.354769 -1.560574 -0.251987 -0.167730 -0.434566 -0.983395 -1.214739 -0.553550
117 -0.720834 -2.761112 0.160442 -2.172318 0.327907 2.040007 -1.279637 -0.793078 -0.587376

118 rows × 9 columns

In [36]:
# To more easily refer to the columns in the scaled dataset we'll use these variables.
ECO,FAM,HEALTH,FREE,TRUST,GEN,BEER,SPIRIT,WINE = 0,1,2,3,4,5,6,7,8

5. Short-list promising models¶

We expect you to do some additional research and train at least one model per team member.

  1. Train mainly quick and dirty models from different categories (e.g. linear, SVM, Random Forests etc) using default parameters
  2. Measure and compare their performance
  3. Analyse the most significant variables for each algorithm
  4. Analyse the types of errors the models make
  5. Have a quick round of feature selection and engineering if necessary
  6. Have one or two more quick iterations of the five previous steps
  7. Short-list the top three to five most promising models, preferring models that make different types of errors
In [37]:
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
Xcolumns = X.columns
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)
y = df_hap['Happiness Score']
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
In [38]:
ODS = LinearRegression()
ODS.fit(X_train, y_train)
print("Linear Regression: R^2 on train data is {} and on test data is {}".format(ODS.score(X_train, y_train), 
                                                              ODS.score(X_test,y_test)))

alphas = 10**np.linspace(-10, 10, 100)

ridgecv = RidgeCV(alphas = alphas)
ridgecv.fit(X_train, y_train)

print("Ridge: R^2 on train data is {} and on test data is {}".format(ridgecv.score(X_train, y_train), 
                                                              ridgecv.score(X_test,y_test)))

lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000)
lassocv.fit(X_train, y_train)

print("Lasso: R^2 on train data is {} and on test data is {}".format(lassocv.score(X_train, y_train), 
                                                              lassocv.score(X_test,y_test)))

e_netCV = ElasticNetCV(alphas = None, max_iter = 10000)
e_netCV.fit(X_train, y_train)

print("Elastic Net: R^2 on train data is {} and on test data is {}".format(e_netCV.score(X_train, y_train), 
                                                              e_netCV.score(X_test,y_test)))

regressor = DecisionTreeRegressor(random_state = 69)
regressor.fit(X_train, y_train)
print("Decision Tree: R^2 on train data is {} and on test data is {}".format(regressor.score(X_train, y_train), 
                                                              regressor.score(X_test,y_test)))


regressor_forest = RandomForestRegressor(random_state = 69)
regressor_forest.fit(X_train, y_train)
regressor_forest.score(X_test,y_test)
print("Random Forest: R^2 on train data is {} and on test data is {}".format(regressor_forest.score(X_train, y_train), 
                                                              regressor_forest.score(X_test,y_test)))


boost = GradientBoostingRegressor(random_state=69)
boost.fit(X_train,y_train)
print("Gradient: R^2 on train data is {} and on test data is {}".format(boost.score(X_train, y_train), 
                                                              boost.score(X_test,y_test)))

ada = AdaBoostRegressor(random_state=69)
ada.fit(X_train,y_train)
print("Ada Boost: R^2 on train data is {} and on test data is {}".format(ada.score(X_train, y_train), 
                                                              ada.score(X_test,y_test)))
Linear Regression: R^2 on train data is 0.8157205480939402 and on test data is 0.7391470750274364
Ridge: R^2 on train data is 0.809673655221876 and on test data is 0.7631040176342635
Lasso: R^2 on train data is 0.8157052280359876 and on test data is 0.7398946689393139
Elastic Net: R^2 on train data is 0.8141391042735131 and on test data is 0.7491176402712683
Decision Tree: R^2 on train data is 1.0 and on test data is 0.3604399134387075
Random Forest: R^2 on train data is 0.9623496313759795 and on test data is 0.7036447808712301
Gradient: R^2 on train data is 0.9991994231578427 and on test data is 0.7218707180251747
Ada Boost: R^2 on train data is 0.9328801636791149 and on test data is 0.7141860938231955

Below is a table of the scores from above; the top four are Ridge, Elastic Net, Lasso and Linear Regression.

Model Train score Test score
Linear Regression 0.81572 0.73915
Ridge 0.80967 0.76310
Lasso 0.81571 0.73989
Elastic Net 0.81414 0.74912
Decision Tree 1.00000 0.36044
Random Forest 0.96235 0.70364
Gradient 0.99919 0.72187
Ada Boost 0.93288 0.71419

Below we tune some of the algorithms to reduce overfitting.

In [39]:
regressor_forest = RandomForestRegressor(random_state = 69, max_depth=2)
regressor_forest.fit(X_train, y_train)
regressor_forest.score(X_test,y_test)
print("Random Forest: R^2 on train data is {} and on test data is {}".format(regressor_forest.score(X_train, y_train), 
                                                              regressor_forest.score(X_test,y_test)))

boost = GradientBoostingRegressor(random_state=69, max_depth=1)
boost.fit(X_train,y_train)
print("Gradient: R^2 on train data is {} and on test data is {}".format(boost.score(X_train, y_train), 
                                                              boost.score(X_test,y_test)))
Random Forest: R^2 on train data is 0.8329602273895879 and on test data is 0.6623605120584957
Gradient: R^2 on train data is 0.9231823253002066 and on test data is 0.7372596902493713

We will also try to see which are the most important features using random forest regression.

In [40]:
def plot_feature_importances_sorted(model):
    features = np.array(Xcolumns)
    importances = model.feature_importances_
    sorted_idx = np.argsort(importances)
    padding = np.arange(len(features)) + 0.5
    plt.barh(padding, importances[sorted_idx], align='center')
    plt.yticks(padding, features[sorted_idx])
    plt.xlabel("Relative Importance")
    plt.title("Variable Importance")
    plt.show()
In [41]:
plot_feature_importances_sorted(regressor_forest)

We will train the random forest again, keeping only the most important features, and see whether we can get a better score than Ridge.

In [42]:
# reset data
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
Xcolumns = X.columns
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)

X = np.delete(X, WINE, 1)
X = np.delete(X, SPIRIT, 1)
y = df_hap['Happiness Score']

X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
regressor_forest = RandomForestRegressor(random_state = 69, max_depth=2)
regressor_forest.fit(X_train, y_train)
regressor_forest.score(X_test,y_test)
print("Random Forest: R^2 on train data is {} and on test data is {}".format(regressor_forest.score(X_train, y_train), 
                                                              regressor_forest.score(X_test,y_test)))
Random Forest: R^2 on train data is 0.8326492428156251 and on test data is 0.667166565435729

Ridge has the best performance.

We will compare the performance of training a model on each feature separately using linear regression.

In [43]:
# Scaling is not needed here: the R^2 of a single-feature linear regression is
# unchanged by rescaling that feature.
labels = ["Economy", "Family", "Health", "Freedom", "Trust", "Generosity", "Beer", "Spirit", "Wine"]
y = df_hap['Happiness Score']
for i, label in enumerate(labels, start=1):
    X = df_hap.iloc[:, i].values.reshape(-1, 1)
    X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
    ODS = LinearRegression()
    ODS.fit(X_train, y_train)
    print("{}: R^2 on train data is {} and on test data is {}".format(label, ODS.score(X_train, y_train),
                                                                      ODS.score(X_test, y_test)))
Economy: R^2 on train data is 0.6854382630802429 and on test data is 0.5663184780870714
Family: R^2 on train data is 0.526469021145125 and on test data is 0.5786411376823299
Health: R^2 on train data is 0.5265406846559759 and on test data is 0.5547078441889846
Freedom: R^2 on train data is 0.3125806992772868 and on test data is 0.4115933580570228
Trust: R^2 on train data is 0.24512367376479682 and on test data is 0.25333282518759226
Generosity: R^2 on train data is 0.07715095666439076 and on test data is 0.07544304195161933
Beer: R^2 on train data is 0.3929679893383089 and on test data is 0.049712140263640237
Spirit: R^2 on train data is 0.026623641872914994 and on test data is 0.03806296991174385
Wine: R^2 on train data is 0.1661412431313125 and on test data is 0.18887881399226503
Feature Train score Test score
Economy 0.68544 0.56632
Family 0.52647 0.57864
Health 0.52654 0.55471
Freedom 0.31258 0.41159
Trust 0.24512 0.25333
Generosity 0.07715 0.07544
Beer 0.39297 0.04971
Spirit 0.02662 0.03806
Wine 0.16614 0.18888

As we also saw when computing the correlations, the single-feature performance is best for Economy, Family and Health. However, each single-feature model performs worse than the model using all features, so we conclude that it is a combination of these features that explains a country's happiness. We will use Ridge for feature selection.

In [44]:
# reset data
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
Xcolumns = X.columns
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)
Out[44]:
0 1 2 3 4 5 6 7 8
0 1.106305 1.292497 0.904595 1.356911 2.717426 1.060275 0.813166 -0.205308 2.366801
1 1.331347 1.218619 1.211045 1.399515 2.432123 0.412142 0.440475 0.037467 2.389352
2 1.066532 1.370448 1.230427 1.265826 0.129687 1.982390 0.899172 -0.460860 0.111704
3 1.463163 1.145381 0.907975 1.472272 1.955712 1.198428 0.287576 -0.333084 0.686753
4 1.012132 1.176289 0.976126 1.299023 2.414654 0.204512 1.185857 0.459128 0.325938
... ... ... ... ... ... ... ... ... ...
113 -1.445914 -0.254981 -1.069776 -0.453799 -0.737281 0.683720 -0.983395 -1.163628 -0.756509
114 -2.404715 -1.343978 -1.634764 -0.869597 -0.758964 0.089999 -1.145850 0.701902 -0.745233
115 -1.647307 -2.938698 -1.731085 0.098592 -0.598405 -0.221166 -1.002508 -1.189184 -0.621203
116 -1.946556 -3.354769 -1.560574 -0.251987 -0.167730 -0.434566 -0.983395 -1.214739 -0.553550
117 -0.720834 -2.761112 0.160442 -2.172318 0.327907 2.040007 -1.279637 -0.793078 -0.587376

118 rows × 9 columns

In [45]:
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)

ridgecv = RidgeCV(alphas = alphas)
ridgecv.fit(X_train, y_train)

print("R^2 on train data is {} and on test data is {}".format(ridgecv.score(X_train, y_train), 
                                                              ridgecv.score(X_test,y_test)))

pd.Series(ridgecv.coef_, index=Xcolumns)
R^2 on train data is 0.809673655221876 and on test data is 0.7631040176342635
Out[45]:
Economy (GDP per Capita)         0.348945
Family                           0.257445
Health (Life Expectancy)         0.171703
Freedom                          0.116715
Trust (Government Corruption)    0.156693
Generosity                       0.100187
Beer_PerCapita                   0.184716
Spirit_PerCapita                -0.026831
Wine_PerCapita                  -0.082030
dtype: float64

We will remove the least important features: Wine and Spirit.

In [46]:
# reset data
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)

X = np.delete(X, WINE, 1)
X = np.delete(X, SPIRIT, 1)
y = df_hap['Happiness Score']
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
ridgecv = RidgeCV(alphas = alphas)
ridgecv.fit(X_train, y_train)

print("R^2 on train data is {} and on test data is {}".format(ridgecv.score(X_train, y_train), 
                                                              ridgecv.score(X_test,y_test)))

pd.Series(ridgecv.coef_, index=Xcolumns[0:7])
R^2 on train data is 0.7969862776947938 and on test data is 0.7668038411987816
Out[46]:
Economy (GDP per Capita)         0.297983
Family                           0.244439
Health (Life Expectancy)         0.163010
Freedom                          0.135679
Trust (Government Corruption)    0.148611
Generosity                       0.090124
Beer_PerCapita                   0.163046
dtype: float64

We will keep the remaining features, as they have comparable importance, and train the different models again with the features Economy, Family, Health, Freedom, Trust, Generosity and Beer.

In [47]:
# reset data
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)

X = np.delete(X, WINE, 1)
X = np.delete(X, SPIRIT, 1)
y = df_hap['Happiness Score']
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
In [48]:
ODS = LinearRegression()
ODS.fit(X_train, y_train)
print("Linear Regression: R^2 on train data is {} and on test data is {}".format(ODS.score(X_train, y_train), 
                                                              ODS.score(X_test,y_test)))

alphas = 10**np.linspace(-10, 10, 100)

ridgecv = RidgeCV(alphas = alphas)
ridgecv.fit(X_train, y_train)

print("Ridge: R^2 on train data is {} and on test data is {}".format(ridgecv.score(X_train, y_train), 
                                                              ridgecv.score(X_test,y_test)))

lassocv = LassoCV(alphas = None, cv = 10, max_iter = 100000)
lassocv.fit(X_train, y_train)

print("Lasso: R^2 on train data is {} and on test data is {}".format(lassocv.score(X_train, y_train), 
                                                              lassocv.score(X_test,y_test)))

e_netCV = ElasticNetCV(alphas = None, max_iter = 10000)
e_netCV.fit(X_train, y_train)

print("Elastic Net: R^2 on train data is {} and on test data is {}".format(e_netCV.score(X_train, y_train), 
                                                              e_netCV.score(X_test,y_test)))

regressor = DecisionTreeRegressor(random_state = 69)
regressor.fit(X_train, y_train)
print("Decision Tree: R^2 on train data is {} and on test data is {}".format(regressor.score(X_train, y_train), 
                                                              regressor.score(X_test,y_test)))


regressor_forest = RandomForestRegressor(random_state = 69, max_depth=2)
regressor_forest.fit(X_train, y_train)
regressor_forest.score(X_test,y_test)
print("Random Forest: R^2 on train data is {} and on test data is {}".format(regressor_forest.score(X_train, y_train), 
                                                              regressor_forest.score(X_test,y_test)))


boost = GradientBoostingRegressor(random_state=69, max_depth=1)
boost.fit(X_train,y_train)
print("Gradient: R^2 on train data is {} and on test data is {}".format(boost.score(X_train, y_train), 
                                                              boost.score(X_test,y_test)))

ada = AdaBoostRegressor(random_state=69)
ada.fit(X_train,y_train)
print("Ada Boost: R^2 on train data is {} and on test data is {}".format(ada.score(X_train, y_train), 
                                                              ada.score(X_test,y_test)))
Linear Regression: R^2 on train data is 0.8032210045447299 and on test data is 0.7590406179801248
Ridge: R^2 on train data is 0.7969862776947938 and on test data is 0.7668038411987816
Lasso: R^2 on train data is 0.8032070467578001 and on test data is 0.7588620463923423
Elastic Net: R^2 on train data is 0.8029405011774073 and on test data is 0.7603046550881256
Decision Tree: R^2 on train data is 1.0 and on test data is 0.47246494234347336
Random Forest: R^2 on train data is 0.8326492428156251 and on test data is 0.667166565435729
Gradient: R^2 on train data is 0.9194812328880763 and on test data is 0.7371589378156305
Ada Boost: R^2 on train data is 0.9436833257711875 and on test data is 0.6828886263073815

Below are all of the scores; the four best are again the linear models.

Model Train score Test score
Linear Regression 0.80322 0.75904
Ridge 0.79699 0.76680
Lasso 0.80321 0.75886
Elastic Net 0.80294 0.76030
Decision Tree 1.00000 0.47247
Random Forest 0.83265 0.66717
Gradient 0.91948 0.73716
Ada Boost 0.94368 0.68289

The best model is Ridge, followed by Elastic Net, then Linear Regression, then Lasso. The other models are not good candidates, as most of them overfit and have worse test scores than the linear models.
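
Since all of the scores above come from a single 50/50 split, a hedged sanity check is to repeat the comparison with k-fold cross-validation; the sketch below reuses X, y and the alpha selected by RidgeCV in the cell above, and the fold count is our own choice:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 for the best linear model, as a rough check that the
# single-split score above is not a fluke.
scores = cross_val_score(Ridge(alpha=ridgecv.alpha_), X, y, cv=5)
print("Ridge 5-fold R^2: mean {:.3f}, std {:.3f}".format(scores.mean(), scores.std()))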

6. Fine-tune the system¶

  1. Fine-tune the hyperparameters
  2. Once you are confident about your final model, measure its performance on the test set to estimate the generalisation error
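
As an illustration of step 1, the regularisation strength could also be tuned explicitly with GridSearchCV instead of RidgeCV; a hedged sketch, where the grid and fold count are our own choices and X_train/X_test refer to the split from the previous section:

from sklearn.model_selection import GridSearchCV

# Hedged sketch: grid-search Ridge's alpha with 5-fold CV; this mirrors what
# RidgeCV does internally with the same alpha grid.
grid = GridSearchCV(Ridge(), {"alpha": list(10**np.linspace(-10, 10, 100))}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))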
In [49]:
# reset data
X = df_hap.iloc[:,1:]
y = df_hap['Happiness Score']
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
pd.DataFrame(X)

X = np.delete(X, WINE, 1)
X = np.delete(X, SPIRIT, 1)
y = df_hap['Happiness Score']
X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=69)
ridgecv = RidgeCV(alphas = alphas)
ridgecv.fit(X_train, y_train)

print("R^2 on train data is {} and on test data is {}".format(ridgecv.score(X_train, y_train), 
                                                              ridgecv.score(X_test,y_test)))

pd.Series(ridgecv.coef_, index=Xcolumns[0:7])
R^2 on train data is 0.7969862776947938 and on test data is 0.7668038411987816
Out[49]:
Economy (GDP per Capita)         0.297983
Family                           0.244439
Health (Life Expectancy)         0.163010
Freedom                          0.135679
Trust (Government Corruption)    0.148611
Generosity                       0.090124
Beer_PerCapita                   0.163046
dtype: float64
In [60]:
fig = plt.figure(figsize=(15, 5))
ax = fig.add_axes([0,0,1,1])
ax.bar(Xcolumns[0:7], list(ridgecv.coef_) )
plt.show()

The best model is therefore Ridge (tuned with RidgeCV), with a test R² of 0.76680 on the held-out data.
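
To express the generalisation error in the units of the target (step 2 of the checklist), a hedged addition using the already-imported mean_squared_error and the final model from the cell above:

# RMSE of the final Ridge model on the held-out test set, in happiness-score units (0-10 scale).
rmse = np.sqrt(mean_squared_error(y_test, ridgecv.predict(X_test)))
print("Test RMSE: {:.3f} happiness points".format(rmse))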

7. Present your solution¶

  1. Document what you have done
  2. Create a nice 15 minute video presentation with slides
    • Make sure you highlight the big picture first
  3. Explain why your solution achieves the business objective
  4. Don't forget to present interesting points you noticed along the way:
    • Describe what worked and what did not
    • List your assumptions and your model's limitations
  5. Ensure your key findings are communicated through nice visualisations or easy-to-remember statements (e.g. "the median income is the number-one predictor of housing prices")
  6. Upload the presentation to some online platform, e.g. YouTube or Vimeo, and supply a link to the video in the notebook.

https://www.youtube.com/watch?v=9ftO3___PKQ