I don’t know how to compute the average R-squared between each individual stock’s return and the market return.
"""Average R-squared of individual stock returns against the market return.

One straight-line regression is evaluated per (company, year) group and the
resulting R-squared values are averaged.
"""
import numpy as np
import pandas as pd

# Windows path; raw string so the backslashes are not treated as escapes.
CSV_PATH = r'C:\Users\USER\Desktop\股價資料.csv'
COMPANY_COL = 'NAME OF COMPANY'
YEAR_COL = 'YEAR'
STOCK_RETURN_COL = 'INDIVIDUAL COMPANY RETURN'
MARKET_RETURN_COL = 'MARKET RETURN'


def r_squared(x, y):
    """Return R-squared of the least-squares line predicting *y* from *x*.

    For a regression with one predictor and an intercept, R-squared equals
    the squared correlation between x and y, so no model has to be fitted.

    Returns NaN when the value is undefined: fewer than two observations,
    or no variation in either x or y.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size < 2 or x.size != y.size:
        return float("nan")
    dx = x - x.mean()
    dy = y - y.mean()
    sxx = float(np.dot(dx, dx))
    syy = float(np.dot(dy, dy))
    if sxx == 0.0 or syy == 0.0:
        return float("nan")
    sxy = float(np.dot(dx, dy))
    return sxy * sxy / (sxx * syy)


def r_squared_by_group(df):
    """Return a Series of R-squared values, one per (company, year) group.

    Rows with a missing stock or market return are dropped first. Rows are
    grouped by their company and year values, so the result does not depend
    on the rows being sorted or on the total number of rows.
    """
    clean = df.dropna(subset=[STOCK_RETURN_COL, MARKET_RETURN_COL])
    grouped = clean.groupby([COMPANY_COL, YEAR_COL], sort=False)
    scores = {
        key: r_squared(group[MARKET_RETURN_COL], group[STOCK_RETURN_COL])
        for key, group in grouped
    }
    return pd.Series(scores, dtype=float)


def average_r_squared(df):
    """Return the mean R-squared over all (company, year) groups.

    Groups whose R-squared is undefined (NaN) are ignored. The result is NaN
    when no group has a defined R-squared.
    """
    return float(r_squared_by_group(df).mean())


def main():
    """Load the stock data from CSV_PATH and print the average R-squared."""
    df = pd.read_csv(CSV_PATH)
    print(average_r_squared(df))


if __name__ == "__main__":
    main()
This is what my code looks like now. I’ve tried to split the data in y1 and y2 using the row indices appended to list1, so that I can compute R-squared for each segment.
Advertisement
Answer
Formula of R squared:
$$R^2 = 1 - \frac{\text{unexplained variation}}{\text{total variation}}$$
Unexplained variation
is the sum, over all data points, of the squared differences between the value predicted by the line of best fit and the actual value. You can compute the coefficients of the line of best fit with numpy.polyfit().
Total variation
is the sum, over all data points, of the squared differences between the actual value and the average value.
EDIT: With dummy values, it would look something like this
"""R-squared of a straight-line fit, demonstrated on dummy values."""
import numpy as np

x = [2000, 2001, 2002, 2003, 2004]
y = [50000, 10000, 20000, 30000, 5000]


def get_unexplained_variation(xs, ys):
    """Return the sum of squared residuals of the line of best fit.

    The line is the degree-1 least-squares fit of *ys* on *xs* obtained from
    ``np.polyfit``; the residual of a point is the fitted value minus the
    actual value.
    """
    xs = np.asarray(xs, dtype=float)
    ys = np.asarray(ys, dtype=float)
    slope, intercept = np.polyfit(xs, ys, 1)
    residuals = slope * xs + intercept - ys
    return float(np.sum(residuals ** 2))


def get_total_variation(xs, ys):
    """Return the sum of squared deviations of *ys* from their mean.

    *xs* is unused; it is accepted so that both variation functions share
    the same signature.
    """
    ys = np.asarray(ys, dtype=float)
    if ys.size == 0:
        return 0.0
    return float(np.sum((ys - ys.mean()) ** 2))


def r_squared(xs, ys):
    """Return ``1 - unexplained_variation / total_variation``.

    Returns NaN when the total variation is zero (all *ys* equal), because
    the ratio is undefined in that case.
    """
    total = get_total_variation(xs, ys)
    if total == 0:
        return float("nan")
    return 1 - get_unexplained_variation(xs, ys) / total


unexplained_variation = get_unexplained_variation(x, y)
total_variation = get_total_variation(x, y)
print(1 - unexplained_variation / total_variation)