Calculate correlation and p-Value between two lists

Two of the most commonly used statistical tools for establishing a relationship between variables are the correlation coefficient and the p-value. Correlation measures whether, and how strongly, two variables are related, whereas the p-value tells us whether the result of an experiment (here, the observed correlation) is statistically significant rather than a product of chance.

# Load "nba.csv" from the current working directory into a pandas DataFrame.

import pandas as pd
df = pd.read_csv("nba.csv")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats


def plot_cor_matrix(corr, mask=None):
    """Draw an annotated heatmap of a correlation matrix on a new figure.

    Args:
        corr: The matrix to plot, passed straight to ``sns.heatmap``.
        mask: Optional mask forwarded to ``sns.heatmap``; ``None`` means
            no mask is applied.

    Returns:
        None. The figure is created but not shown; the caller is expected
        to call ``plt.show()``.
    """
    # Purely cosmetic settings: cell annotations, a diverging colour scale
    # fixed to [-1, 1] and centred on 0, black cell borders, and a
    # horizontal colour bar.
    cosmetics = {
        'annot': True,
        'vmin': -1,
        'vmax': 1,
        'center': 0,
        'cmap': 'coolwarm',
        'linewidths': 2,
        'linecolor': 'black',
        'cbar_kws': {'orientation': 'horizontal'},
    }
    _, axes = plt.subplots(figsize=(11, 9))
    sns.heatmap(corr, ax=axes, mask=mask, **cosmetics)

def corr_sig(df=None):
    """Return the matrix of Pearson-correlation p-values for all column pairs.

    Entry ``[i, j]`` holds the two-sided p-value returned by
    ``scipy.stats.pearsonr`` for the i-th and j-th columns of ``df``.
    The diagonal is left at 0.

    Rows in which either column of a pair is missing (NaN) are ignored for
    that pair, so the p-values are based on the same complete pairs that
    ``DataFrame.corr()`` uses. If fewer than two complete pairs remain,
    the p-value for that pair is NaN.

    Args:
        df: DataFrame whose columns are numeric.

    Returns:
        A symmetric ``numpy.ndarray`` of shape ``(n_columns, n_columns)``.

    Raises:
        TypeError: If ``df`` is None.
    """
    if df is None:
        raise TypeError("corr_sig() requires a DataFrame, got None")

    n_cols = df.shape[1]
    p_matrix = np.zeros(shape=(n_cols, n_cols))

    # Address columns by position: this avoids repeated label lookups and
    # per-iteration frame copies, and stays correct with duplicate labels.
    values = [df.iloc[:, k].to_numpy() for k in range(n_cols)]
    present = [df.iloc[:, k].notna().to_numpy() for k in range(n_cols)]

    for i in range(n_cols):
        # The p-value is symmetric in its arguments, so compute each
        # unordered pair once and mirror it.
        for j in range(i + 1, n_cols):
            valid = present[i] & present[j]
            if valid.sum() < 2:
                # pearsonr needs at least two observations.
                p = np.nan
            else:
                _, p = stats.pearsonr(values[i][valid], values[j][valid])
            p_matrix[i, j] = p
            p_matrix[j, i] = p
    return p_matrix


# Keep only the two columns of interest. Rows with a missing value in either
# column are dropped so that the correlation and the p-value are computed
# from exactly the same observations.
df_new = df[['Age', 'Weight']].dropna().reset_index(drop=True)

# First figure: the correlation matrix, masked with the upper triangle of
# the matrix itself (np.triu keeps the diagonal, so it is part of the mask).
corr = df_new.corr()                            # get correlation
mask = np.triu(corr)
plot_cor_matrix(corr, mask)
plt.show()

# Second figure: the same correlations, masked wherever the lower-triangular
# significance test (p < 0.05) is False.
p_values = corr_sig(df_new)                     # get p-Value
mask = np.invert(np.tril(p_values < 0.05))      # mask - only get significant corr
plot_cor_matrix(corr, mask)
plt.show()

Leave a Reply

Your email address will not be published. Required fields are marked *

Proudly powered by WordPress | Theme: Rits Blog by Crimson Themes.