Interactions and ANOVA

Note: This script is based heavily on Jonathan Taylor's class notes http://www.stanford.edu/class/stats191/interactions.html

Download and format data:

In [1]:
%matplotlib inline

from __future__ import print_function
from statsmodels.compat import urlopen
import numpy as np
np.set_printoptions(precision=4, suppress=True)
import statsmodels.api as sm
import pandas as pd
pd.set_option("display.width", 100)
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

# Load the salary data, preferring a locally cached copy over the network.
try:
    salary_table = pd.read_csv('salary.table')
except (IOError, OSError):  # no cached copy yet: download it, then cache it
    url = 'http://stats191.stanford.edu/data/salary.table'
    # pandas reads URLs directly, so no separate urlopen handle is left open.
    salary_table = pd.read_table(url)
    # index=False keeps the cached file free of an extra "Unnamed: 0" column,
    # so a cached run yields the same columns as a fresh download.
    salary_table.to_csv('salary.table', index=False)

# Column aliases used by the plotting cells below; the plots label X as
# experience and S as salary, and the text refers to E as education and
# M as management.
E = salary_table.E
M = salary_table.M
X = salary_table.X
S = salary_table.S
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-1-8c4c3a146aa8> in <module>()
     15 try:
---> 16     salary_table = pd.read_csv('salary.table')
     17 except:  # recent pandas can read URL without urlopen

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    729 
--> 730         self._make_engine(self.engine)
    731 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1389 
-> 1390         self._reader = _parser.TextReader(src, **kwds)
   1391 

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)()

FileNotFoundError: File b'salary.table' does not exist

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1253             try:
-> 1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error

/usr/lib/python3.5/http/client.py in request(self, method, url, body, headers)
   1106         """Send a complete request to the server."""
-> 1107         self._send_request(method, url, body, headers)
   1108 

/usr/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
   1151             body = _encode(body, 'body')
-> 1152         self.endheaders(body)
   1153 

/usr/lib/python3.5/http/client.py in endheaders(self, message_body)
   1102             raise CannotSendHeader()
-> 1103         self._send_output(message_body)
   1104 

/usr/lib/python3.5/http/client.py in _send_output(self, message_body)
    933 
--> 934         self.send(msg)
    935         if message_body is not None:

/usr/lib/python3.5/http/client.py in send(self, data)
    876             if self.auto_open:
--> 877                 self.connect()
    878             else:

/usr/lib/python3.5/http/client.py in connect(self)
    848         self.sock = self._create_connection(
--> 849             (self.host,self.port), self.timeout, self.source_address)
    850         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    711     if err is not None:
--> 712         raise err
    713     else:

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    702                 sock.bind(source_address)
--> 703             sock.connect(sa)
    704             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-1-8c4c3a146aa8> in <module>()
     17 except:  # recent pandas can read URL without urlopen
     18     url = 'http://stats191.stanford.edu/data/salary.table'
---> 19     fh = urlopen(url)
     20     salary_table = pd.read_table(fh)
     21     salary_table.to_csv('salary.table')

/usr/lib/python3.5/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    161     else:
    162         opener = _opener
--> 163     return opener.open(url, data, timeout)
    164 
    165 def install_opener(opener):

/usr/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    464             req = meth(req)
    465 
--> 466         response = self._open(req, data)
    467 
    468         # post-process response

/usr/lib/python3.5/urllib/request.py in _open(self, req, data)
    482         protocol = req.type
    483         result = self._call_chain(self.handle_open, protocol, protocol +
--> 484                                   '_open', req)
    485         if result:
    486             return result

/usr/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    442         for handler in handlers:
    443             func = getattr(handler, meth_name)
--> 444             result = func(*args)
    445             if result is not None:
    446                 return result

/usr/lib/python3.5/urllib/request.py in http_open(self, req)
   1280 
   1281     def http_open(self, req):
-> 1282         return self.do_open(http.client.HTTPConnection, req)
   1283 
   1284     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error
-> 1256                 raise URLError(err)
   1257             r = h.getresponse()
   1258         except:

URLError: <urlopen error [Errno 111] Connection refused>

Take a look at the data:

In [2]:
# Scatter salary against experience with one marker/colour combination per
# (E, M) group.
plt.figure(figsize=(6, 6))
symbols = ['D', '^']         # looked up by the M value of the group
colors = ['r', 'g', 'blue']  # looked up by the E value of the group, minus one
factor_groups = salary_table.groupby(['E', 'M'])
for (educ, mgmt), members in factor_groups:
    plt.scatter(members['X'], members['S'],
                marker=symbols[mgmt], color=colors[educ - 1], s=144)
plt.xlabel('Experience');
plt.ylabel('Salary');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-66af8762e78f> in <module>()
      2 symbols = ['D', '^']
      3 colors = ['r', 'g', 'blue']
----> 4 factor_groups = salary_table.groupby(['E','M'])
      5 for values, group in factor_groups:
      6     i,j = values

NameError: name 'salary_table' is not defined
<matplotlib.figure.Figure at 0x7fb52c1ee898>

Fit a linear model:

In [3]:
# Additive model: E and M enter as categorical factors, X as a numeric term.
formula = 'S ~ C(E) + C(M) + X'
lm = ols(formula, data=salary_table).fit()
print(lm.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-95ded19c3a72> in <module>()
      1 formula = 'S ~ C(E) + C(M) + X'
----> 2 lm = ols(formula, salary_table).fit()
      3 print(lm.summary())

NameError: name 'salary_table' is not defined

Have a look at the created design matrix:

In [4]:
# First five rows of the design matrix built from the formula.
lm.model.exog[:5, :]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-e9012a047d1f> in <module>()
----> 1 lm.model.exog[:5]

NameError: name 'lm' is not defined

Or since we initially passed in a DataFrame, we have a DataFrame available in

In [5]:
# Same design matrix, as the labelled DataFrame kept on the model's data.
lm.model.data.orig_exog.head(5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-0d890acdc576> in <module>()
----> 1 lm.model.data.orig_exog[:5]

NameError: name 'lm' is not defined

We keep a reference to the original untouched data in

In [6]:
# First five rows of the original data passed to the model.
lm.model.data.frame.head(5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-7c0e717ac664> in <module>()
----> 1 lm.model.data.frame[:5]

NameError: name 'lm' is not defined

Influence statistics

In [7]:
# Influence diagnostics for the additive fit, printed as a text table.
infl = lm.get_influence()
influence_table = infl.summary_table()
print(influence_table)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-a1537ce8cfa3> in <module>()
----> 1 infl = lm.get_influence()
      2 print(infl.summary_table())

NameError: name 'lm' is not defined

or get a dataframe

In [8]:
# Same influence diagnostics, collected into a DataFrame for further use.
df_infl = infl.summary_frame()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-f4ff1167f21e> in <module>()
----> 1 df_infl = infl.summary_frame()

NameError: name 'infl' is not defined
In [9]:
# Peek at the first five rows of the influence frame.
df_infl.head(5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-1fca7cb59e99> in <module>()
----> 1 df_infl[:5]

NameError: name 'df_infl' is not defined

Now plot the residuals within the groups separately:

In [10]:
# Residuals of the additive fit, drawn in one vertical strip per (E, M) group.
resid = lm.resid
plt.figure(figsize=(6, 6));
for (educ, mgmt), members in factor_groups:
    # One x position per group, for plotting purposes only.
    group_num = 2 * educ + mgmt - 1
    x = [group_num for _ in range(len(members))]
    plt.scatter(x, resid[members.index],
                marker=symbols[mgmt], color=colors[educ - 1],
                s=144, edgecolors='black')
plt.xlabel('Group');
plt.ylabel('Residuals');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-1f49e6986775> in <module>()
----> 1 resid = lm.resid
      2 plt.figure(figsize=(6,6));
      3 for values, group in factor_groups:
      4     i,j = values
      5     group_num = i*2 + j - 1  # for plotting purposes

NameError: name 'lm' is not defined

Now we will test some interactions using anova or f_test

In [11]:
# Add an E-by-X interaction to the additive model.
interX_lm = ols("S ~ C(E) * X + C(M)", data=salary_table).fit()
print(interX_lm.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-1875cf9774f0> in <module>()
----> 1 interX_lm = ols("S ~ C(E) * X + C(M)", salary_table).fit()
      2 print(interX_lm.summary())

NameError: name 'salary_table' is not defined

Do an ANOVA check

In [12]:
# anova_lm is already imported in the notebook's first cell, so it is not
# re-imported here.

# Compare the additive model with the one that adds the E-by-X interaction.
table1 = anova_lm(lm, interX_lm)
print(table1)

# Alternative extension: an E-by-M interaction with a single X term.
interM_lm = ols("S ~ X + C(E)*C(M)", data=salary_table).fit()
print(interM_lm.summary())

# Compare the additive model with the E-by-M interaction model.
table2 = anova_lm(lm, interM_lm)
print(table2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-0c9af2b17bde> in <module>()
      1 from statsmodels.stats.api import anova_lm
      2 
----> 3 table1 = anova_lm(lm, interX_lm)
      4 print(table1)
      5 

NameError: name 'lm' is not defined

The design matrix as a DataFrame

In [13]:
# First five rows of the interaction model's labelled design matrix.
interM_lm.model.data.orig_exog.head(5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-54036886e7c2> in <module>()
----> 1 interM_lm.model.data.orig_exog[:5]

NameError: name 'interM_lm' is not defined

The design matrix as an ndarray

In [14]:
# Only a cell's last expression is echoed, so the design matrix is printed
# explicitly; the column names remain the cell's displayed value.
print(interM_lm.model.exog)
interM_lm.model.exog_names
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-f8ee67f5a751> in <module>()
----> 1 interM_lm.model.exog
      2 interM_lm.model.exog_names

NameError: name 'interM_lm' is not defined
In [15]:
# Studentized residuals of the E-by-M interaction fit, plotted against X and
# styled per (E, M) group.
infl = interM_lm.get_influence()
resid = infl.resid_studentized_internal
plt.figure(figsize=(6, 6))
for (educ, mgmt), members in factor_groups:
    idx = members.index
    plt.scatter(X[idx], resid[idx],
                marker=symbols[mgmt], color=colors[educ - 1],
                s=144, edgecolors='black')
plt.xlabel('X');
plt.ylabel('standardized resids');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-b9ade982a4ac> in <module>()
----> 1 infl = interM_lm.get_influence()
      2 resid = infl.resid_studentized_internal
      3 plt.figure(figsize=(6,6))
      4 for values, group in factor_groups:
      5     i,j = values

NameError: name 'interM_lm' is not defined

Looks like one observation is an outlier.

In [16]:
# Locate the observation with the largest absolute residual and refit every
# model on the remaining rows.
drop_idx = np.abs(resid).argmax()
print(drop_idx)  # zero-based index
idx = salary_table.index.drop(drop_idx)

# Additive model without the flagged row.
lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()
print(lm32.summary())
print('\n')

# E-by-X interaction model without the flagged row.
interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()
print(interX_lm32.summary())
print('\n')

# Additive vs. E-by-X interaction.
table3 = anova_lm(lm32, interX_lm32)
print(table3)
print('\n')

# Additive vs. E-by-M interaction.
interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()
table4 = anova_lm(lm32, interM_lm32)
print(table4)
print('\n')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-a8a74da84db4> in <module>()
----> 1 drop_idx = abs(resid).argmax()
      2 print(drop_idx)  # zero-based index
      3 idx = salary_table.index.drop(drop_idx)
      4 
      5 lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()

NameError: name 'resid' is not defined

Replot the residuals

In [17]:
# Standardized residuals of the refit that excludes the outlier. The former
# try/except repeated the same statement in both branches, so it is dropped.
resid = interM_lm32.get_influence().summary_frame()['standard_resid']

plt.figure(figsize=(6,6))
for values, group in factor_groups:
    i,j = values
    idx = group.index
    # factor_groups was built from the full table, so idx can contain the
    # dropped row, which has no residual in this fit. reindex() yields NaN for
    # that label instead of raising KeyError as plain label indexing does on
    # current pandas; the NaN point is simply not drawn.
    plt.scatter(X[idx], resid.reindex(idx), marker=symbols[j], color=colors[i-1],
            s=144, edgecolors='black')
plt.xlabel('X[~[32]]');
plt.ylabel('standardized resids');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-b457d7767b76> in <module>()
      1 try:
----> 2     resid = interM_lm32.get_influence().summary_frame()['standard_resid']
      3 except:

NameError: name 'interM_lm32' is not defined

During handling of the above exception, another exception occurred:

NameError                                 Traceback (most recent call last)
<ipython-input-17-b457d7767b76> in <module>()
      2     resid = interM_lm32.get_influence().summary_frame()['standard_resid']
      3 except:
----> 4     resid = interM_lm32.get_influence().summary_frame()['standard_resid']
      5 
      6 plt.figure(figsize=(6,6))

NameError: name 'interM_lm32' is not defined

Plot the fitted values

In [18]:
# Final model: E-by-M interaction plus X, fitted without the outlier row.
lm_final = ols('S ~ X + C(E)*C(M)', data = salary_table.drop([drop_idx])).fit()
mf = lm_final.model.data.orig_exog
lstyle = ['-','--']  # line style looked up by the M value of the group

plt.figure(figsize=(6,6))
for values, group in factor_groups:
    i,j = values
    idx = group.index
    # Observed points for this (E, M) group, outlier included.
    plt.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],
                s=144, edgecolors='black')
    # The dropped row is absent from the final model. reindex() turns that
    # missing label into NaN (plain label indexing raises KeyError on current
    # pandas) and dropna() then removes it before drawing the fitted line.
    plt.plot(mf.X.reindex(idx).dropna(),
             lm_final.fittedvalues.reindex(idx).dropna(),
             ls=lstyle[j], color=colors[i-1])
plt.xlabel('Experience');
plt.ylabel('Salary');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-25e9e43f896a> in <module>()
----> 1 lm_final = ols('S ~ X + C(E)*C(M)', data = salary_table.drop([drop_idx])).fit()
      2 mf = lm_final.model.data.orig_exog
      3 lstyle = ['-','--']
      4 
      5 plt.figure(figsize=(6,6))

NameError: name 'salary_table' is not defined

From our first look at the data, the difference between Master's and PhD in the management group is different than in the non-management group. This is an interaction between the two qualitative variables management, M, and education, E. We can visualize this by first removing the effect of experience, then plotting the means within each of the 6 groups using interaction_plot.

In [19]:
# Remove the estimated linear X effect from S, then plot the group means of
# what remains.
x_slope = interX_lm32.params['X']
U = S - X * x_slope

plt.figure(figsize=(6, 6))
interaction_plot(E, M, U,
                 colors=['red', 'blue'], markers=['^', 'D'],
                 markersize=10, ax=plt.gca())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-1bfcb96c9336> in <module>()
----> 1 U = S - X * interX_lm32.params['X']
      2 
      3 plt.figure(figsize=(6,6))
      4 interaction_plot(E, M, U, colors=['red','blue'], markers=['^','D'],
      5         markersize=10, ax=plt.gca())

NameError: name 'S' is not defined

Minority Employment Data

In [20]:
# Load the job-test data from a local copy if present, else from the course site.
try:
    jobtest_table = pd.read_table('jobtest.table')
except (IOError, OSError):  # don't have data already
    url = 'http://stats191.stanford.edu/data/jobtest.table'
    jobtest_table = pd.read_table(url)

# Group by the bare column name. Grouping by the one-element list ['ETHN']
# makes newer pandas yield 1-tuples as group keys, which cannot be used to
# index the colors/markers lists here or in the later plotting cells.
factor_group = jobtest_table.groupby('ETHN')

fig, ax = plt.subplots(figsize=(6,6))
# NOTE: this rebinds `colors`, which the salary plots above index with three
# entries; rerun the salary scatter cell before re-running those plots.
colors = ['purple', 'green']
markers = ['o', 'v']
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
                marker=markers[factor], s=12**2)
ax.set_xlabel('TEST');
ax.set_ylabel('JPERF');
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-20-a1b9047acab9> in <module>()
      1 try:
----> 2     jobtest_table = pd.read_table('jobtest.table')
      3 except:  # don't have data already

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    729 
--> 730         self._make_engine(self.engine)
    731 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1389 
-> 1390         self._reader = _parser.TextReader(src, **kwds)
   1391 

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)()

FileNotFoundError: File b'jobtest.table' does not exist

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1253             try:
-> 1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error

/usr/lib/python3.5/http/client.py in request(self, method, url, body, headers)
   1106         """Send a complete request to the server."""
-> 1107         self._send_request(method, url, body, headers)
   1108 

/usr/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
   1151             body = _encode(body, 'body')
-> 1152         self.endheaders(body)
   1153 

/usr/lib/python3.5/http/client.py in endheaders(self, message_body)
   1102             raise CannotSendHeader()
-> 1103         self._send_output(message_body)
   1104 

/usr/lib/python3.5/http/client.py in _send_output(self, message_body)
    933 
--> 934         self.send(msg)
    935         if message_body is not None:

/usr/lib/python3.5/http/client.py in send(self, data)
    876             if self.auto_open:
--> 877                 self.connect()
    878             else:

/usr/lib/python3.5/http/client.py in connect(self)
    848         self.sock = self._create_connection(
--> 849             (self.host,self.port), self.timeout, self.source_address)
    850         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    711     if err is not None:
--> 712         raise err
    713     else:

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    702                 sock.bind(source_address)
--> 703             sock.connect(sa)
    704             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-20-a1b9047acab9> in <module>()
      3 except:  # don't have data already
      4     url = 'http://stats191.stanford.edu/data/jobtest.table'
----> 5     jobtest_table = pd.read_table(url)
      6 
      7 factor_group = jobtest_table.groupby(['ETHN'])

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    644                     skip_blank_lines=skip_blank_lines)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 
    648     parser_f.__name__ = name

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    373     filepath_or_buffer, _, compression = get_filepath_or_buffer(
    374         filepath_or_buffer, encoding,
--> 375         compression=kwds.get('compression', None))
    376     kwds['compression'] = (inferred_compression if compression == 'infer'
    377                            else compression)

/usr/lib/python3/dist-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression)
    236 
    237     if _is_url(filepath_or_buffer):
--> 238         req = _urlopen(str(filepath_or_buffer))
    239         if compression == 'infer':
    240             content_encoding = req.headers.get('Content-Encoding', None)

/usr/lib/python3.5/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    161     else:
    162         opener = _opener
--> 163     return opener.open(url, data, timeout)
    164 
    165 def install_opener(opener):

/usr/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    464             req = meth(req)
    465 
--> 466         response = self._open(req, data)
    467 
    468         # post-process response

/usr/lib/python3.5/urllib/request.py in _open(self, req, data)
    482         protocol = req.type
    483         result = self._call_chain(self.handle_open, protocol, protocol +
--> 484                                   '_open', req)
    485         if result:
    486             return result

/usr/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    442         for handler in handlers:
    443             func = getattr(handler, meth_name)
--> 444             result = func(*args)
    445             if result is not None:
    446                 return result

/usr/lib/python3.5/urllib/request.py in http_open(self, req)
   1280 
   1281     def http_open(self, req):
-> 1282         return self.do_open(http.client.HTTPConnection, req)
   1283 
   1284     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error
-> 1256                 raise URLError(err)
   1257             r = h.getresponse()
   1258         except:

URLError: <urlopen error [Errno 111] Connection refused>
In [21]:
# Baseline: a single regression line of JPERF on TEST for everyone.
min_lm = ols(formula='JPERF ~ TEST', data=jobtest_table).fit()
print(min_lm.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-0265a2007fef> in <module>()
----> 1 min_lm = ols('JPERF ~ TEST', data=jobtest_table).fit()
      2 print(min_lm.summary())

NameError: name 'jobtest_table' is not defined
In [22]:
# Scatter of the two ETHN groups with the common fitted line on top.
fig, ax = plt.subplots(figsize=(6, 6));
for ethn, members in factor_group:
    ax.scatter(members['TEST'], members['JPERF'],
               color=colors[ethn], marker=markers[ethn], s=144)

ax.set_xlabel('TEST')
ax.set_ylabel('JPERF')
fig = abline_plot(model_results=min_lm, ax=ax)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-d9e4c8602668> in <module>()
      1 fig, ax = plt.subplots(figsize=(6,6));
----> 2 for factor, group in factor_group:
      3     ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
      4                 marker=markers[factor], s=12**2)
      5 

NameError: name 'factor_group' is not defined
In [23]:
# Common intercept, with a TEST:ETHN term added to the TEST slope.
min_lm2 = ols(formula='JPERF ~ TEST + TEST:ETHN', data=jobtest_table).fit()

print(min_lm2.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-23-b316e192b8d7> in <module>()
      1 min_lm2 = ols('JPERF ~ TEST + TEST:ETHN',
----> 2         data=jobtest_table).fit()
      3 
      4 print(min_lm2.summary())

NameError: name 'jobtest_table' is not defined
In [24]:
# Scatter of the two ETHN groups with the two lines implied by min_lm2:
# same intercept, slopes differing by the TEST:ETHN coefficient.
fig, ax = plt.subplots(figsize=(6, 6));
for ethn, members in factor_group:
    ax.scatter(members['TEST'], members['JPERF'],
               color=colors[ethn], marker=markers[ethn], s=144)

coef = min_lm2.params
fig = abline_plot(intercept=coef['Intercept'],
                  slope=coef['TEST'],
                  ax=ax, color='purple');
fig = abline_plot(intercept=coef['Intercept'],
                  slope=coef['TEST'] + coef['TEST:ETHN'],
                  ax=ax, color='green');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-f8432b7c2f24> in <module>()
      1 fig, ax = plt.subplots(figsize=(6,6));
----> 2 for factor, group in factor_group:
      3     ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
      4                 marker=markers[factor], s=12**2)
      5 

NameError: name 'factor_group' is not defined
In [25]:
# Common TEST slope, with an ETHN term added to the intercept.
min_lm3 = ols(formula='JPERF ~ TEST + ETHN', data=jobtest_table).fit()
print(min_lm3.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-d25353a3d723> in <module>()
----> 1 min_lm3 = ols('JPERF ~ TEST + ETHN', data = jobtest_table).fit()
      2 print(min_lm3.summary())

NameError: name 'jobtest_table' is not defined
In [26]:
# Scatter of the two ETHN groups with the two lines implied by min_lm3:
# same slope, intercepts differing by the ETHN coefficient.
fig, ax = plt.subplots(figsize=(6, 6));
for ethn, members in factor_group:
    ax.scatter(members['TEST'], members['JPERF'],
               color=colors[ethn], marker=markers[ethn], s=144)

coef = min_lm3.params
fig = abline_plot(intercept=coef['Intercept'],
                  slope=coef['TEST'],
                  ax=ax, color='purple');
fig = abline_plot(intercept=coef['Intercept'] + coef['ETHN'],
                  slope=coef['TEST'],
                  ax=ax, color='green');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-3e445d581db3> in <module>()
      1 fig, ax = plt.subplots(figsize=(6,6));
----> 2 for factor, group in factor_group:
      3     ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
      4                 marker=markers[factor], s=12**2)
      5 

NameError: name 'factor_group' is not defined
In [27]:
# Full model: ETHN may shift both the intercept and the TEST slope.
min_lm4 = ols(formula='JPERF ~ TEST * ETHN', data=jobtest_table).fit()
print(min_lm4.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-fc62cb35928f> in <module>()
----> 1 min_lm4 = ols('JPERF ~ TEST * ETHN', data = jobtest_table).fit()
      2 print(min_lm4.summary())

NameError: name 'jobtest_table' is not defined
In [28]:
# Scatter of the two ETHN groups with the two lines implied by min_lm4:
# both intercept and slope differ between the groups.
fig, ax = plt.subplots(figsize=(8, 6));
for ethn, members in factor_group:
    ax.scatter(members['TEST'], members['JPERF'],
               color=colors[ethn], marker=markers[ethn], s=144)

coef = min_lm4.params
fig = abline_plot(intercept=coef['Intercept'],
                  slope=coef['TEST'],
                  ax=ax, color='purple');
fig = abline_plot(intercept=coef['Intercept'] + coef['ETHN'],
                  slope=coef['TEST'] + coef['TEST:ETHN'],
                  ax=ax, color='green');
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-28-2572d5e3e956> in <module>()
      1 fig, ax = plt.subplots(figsize=(8,6));
----> 2 for factor, group in factor_group:
      3     ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
      4                 marker=markers[factor], s=12**2)
      5 

NameError: name 'factor_group' is not defined
In [29]:
# Is there any effect of ETHN on the slope or the intercept?
# Compares min_lm against the full interaction fit min_lm4.
# NOTE(review): min_lm is defined in an earlier cell; presumably the fit
# without any ETHN term -- confirm.
reduced_fit, full_fit = min_lm, min_lm4
table5 = anova_lm(reduced_fit, full_fit)
print(table5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-129d94cb4fbc> in <module>()
      1 # is there any effect of ETHN on slope or intercept?
----> 2 table5 = anova_lm(min_lm, min_lm4)
      3 print(table5)

NameError: name 'min_lm' is not defined
In [30]:
# Is there any effect of ETHN on the intercept?
# Compares min_lm against min_lm3, the fit whose ETHN coefficient is added
# to the intercept when its lines are plotted.
# NOTE(review): min_lm is defined in an earlier cell; presumably the fit
# without any ETHN term -- confirm.
reduced_fit, full_fit = min_lm, min_lm3
table6 = anova_lm(reduced_fit, full_fit)
print(table6)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-30-c31cb02af7ce> in <module>()
      1 # is there any effect of ETHN on intercept
----> 2 table6 = anova_lm(min_lm, min_lm3)
      3 print(table6)

NameError: name 'min_lm' is not defined
In [31]:
# Is there any effect of ETHN on the slope?
# Compares min_lm against min_lm2.
# NOTE(review): both fits are defined in earlier cells; presumably min_lm has
# no ETHN term and min_lm2 lets only the slope vary with ETHN -- confirm.
reduced_fit, full_fit = min_lm, min_lm2
table7 = anova_lm(reduced_fit, full_fit)
print(table7)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-31-8848959d255f> in <module>()
      1 # is there any effect of ETHN on slope
----> 2 table7 = anova_lm(min_lm, min_lm2)
      3 print(table7)

NameError: name 'min_lm' is not defined
In [32]:
# Is it just the slope that depends on ETHN, or the intercept as well?
# Compares min_lm2 against the full interaction fit min_lm4.
# NOTE(review): min_lm2 is defined in an earlier cell; presumably the
# slope-only ETHN model -- confirm.
reduced_fit, full_fit = min_lm2, min_lm4
table8 = anova_lm(reduced_fit, full_fit)
print(table8)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-32-b62ed8a937ec> in <module>()
      1 # is it just the slope or both?
----> 2 table8 = anova_lm(min_lm2, min_lm4)
      3 print(table8)

NameError: name 'min_lm2' is not defined

One-way ANOVA

In [33]:
# Load the rehab data, preferring a cached local copy over the network.
# Only a missing or unreadable file triggers the download; any other error
# (and KeyboardInterrupt) propagates instead of being silently swallowed.
try:
    rehab_table = pd.read_csv('rehab.table')
except (IOError, OSError):  # IOError listed for Python 2, where it is not an OSError
    url = 'http://stats191.stanford.edu/data/rehab.csv'
    rehab_table = pd.read_csv(url)
    # index=False keeps the cache identical to the download; otherwise the
    # row index is written out and comes back as an extra unnamed column
    # the next time the cached file is read.
    rehab_table.to_csv('rehab.table', index=False)

# Box plot of Time, grouped by Fitness level.
fig, ax = plt.subplots(figsize=(8, 6))
fig = rehab_table.boxplot(column='Time', by='Fitness', ax=ax, grid=False)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-33-f592f2c5b222> in <module>()
      1 try:
----> 2     rehab_table = pd.read_csv('rehab.table')
      3 except:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    729 
--> 730         self._make_engine(self.engine)
    731 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1389 
-> 1390         self._reader = _parser.TextReader(src, **kwds)
   1391 

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)()

FileNotFoundError: File b'rehab.table' does not exist

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1253             try:
-> 1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error

/usr/lib/python3.5/http/client.py in request(self, method, url, body, headers)
   1106         """Send a complete request to the server."""
-> 1107         self._send_request(method, url, body, headers)
   1108 

/usr/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
   1151             body = _encode(body, 'body')
-> 1152         self.endheaders(body)
   1153 

/usr/lib/python3.5/http/client.py in endheaders(self, message_body)
   1102             raise CannotSendHeader()
-> 1103         self._send_output(message_body)
   1104 

/usr/lib/python3.5/http/client.py in _send_output(self, message_body)
    933 
--> 934         self.send(msg)
    935         if message_body is not None:

/usr/lib/python3.5/http/client.py in send(self, data)
    876             if self.auto_open:
--> 877                 self.connect()
    878             else:

/usr/lib/python3.5/http/client.py in connect(self)
    848         self.sock = self._create_connection(
--> 849             (self.host,self.port), self.timeout, self.source_address)
    850         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    711     if err is not None:
--> 712         raise err
    713     else:

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    702                 sock.bind(source_address)
--> 703             sock.connect(sa)
    704             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-33-f592f2c5b222> in <module>()
      3 except:
      4     url = 'http://stats191.stanford.edu/data/rehab.csv'
----> 5     rehab_table = pd.read_table(url, delimiter=",")
      6     rehab_table.to_csv('rehab.table')
      7 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    644                     skip_blank_lines=skip_blank_lines)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 
    648     parser_f.__name__ = name

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    373     filepath_or_buffer, _, compression = get_filepath_or_buffer(
    374         filepath_or_buffer, encoding,
--> 375         compression=kwds.get('compression', None))
    376     kwds['compression'] = (inferred_compression if compression == 'infer'
    377                            else compression)

/usr/lib/python3/dist-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression)
    236 
    237     if _is_url(filepath_or_buffer):
--> 238         req = _urlopen(str(filepath_or_buffer))
    239         if compression == 'infer':
    240             content_encoding = req.headers.get('Content-Encoding', None)

/usr/lib/python3.5/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    161     else:
    162         opener = _opener
--> 163     return opener.open(url, data, timeout)
    164 
    165 def install_opener(opener):

/usr/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    464             req = meth(req)
    465 
--> 466         response = self._open(req, data)
    467 
    468         # post-process response

/usr/lib/python3.5/urllib/request.py in _open(self, req, data)
    482         protocol = req.type
    483         result = self._call_chain(self.handle_open, protocol, protocol +
--> 484                                   '_open', req)
    485         if result:
    486             return result

/usr/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    442         for handler in handlers:
    443             func = getattr(handler, meth_name)
--> 444             result = func(*args)
    445             if result is not None:
    446                 return result

/usr/lib/python3.5/urllib/request.py in http_open(self, req)
   1280 
   1281     def http_open(self, req):
-> 1282         return self.do_open(http.client.HTTPConnection, req)
   1283 
   1284     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error
-> 1256                 raise URLError(err)
   1257             r = h.getresponse()
   1258         except:

URLError: <urlopen error [Errno 111] Connection refused>
In [34]:
# One-way ANOVA: Time modelled by the categorical factor Fitness.
rehab_lm = ols(formula='Time ~ C(Fitness)', data=rehab_table).fit()

table9 = anova_lm(rehab_lm)
design_matrix = rehab_lm.model.data.orig_exog

print(table9)
# orig_exog is the exogenous (design) data the model was built from.
print(design_matrix)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-34-ba617da218fc> in <module>()
----> 1 rehab_lm = ols('Time ~ C(Fitness)', data=rehab_table).fit()
      2 table9 = anova_lm(rehab_lm)
      3 print(table9)
      4 
      5 print(rehab_lm.model.data.orig_exog)

NameError: name 'rehab_table' is not defined
In [35]:
# Text summary of the fitted one-way model (rehab_lm).
rehab_summary = rehab_lm.summary()
print(rehab_summary)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-35-2b2c1df214c7> in <module>()
----> 1 print(rehab_lm.summary())

NameError: name 'rehab_lm' is not defined

Two-way ANOVA

In [36]:
# Load the kidney data, preferring a local copy over the network.
# The remote file is parsed as whitespace-delimited, so the local copy must be
# parsed with the same separator; a tab-only parse of the same file would not
# split the columns the same way.
# sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True.
try:
    kidney_table = pd.read_csv('./kidney.table', sep=r'\s+')
except (IOError, OSError):  # IOError listed for Python 2, where it is not an OSError
    # Only a missing or unreadable local file falls back to the download;
    # other errors (and KeyboardInterrupt) are no longer swallowed.
    url = 'http://stats191.stanford.edu/data/kidney.table'
    kidney_table = pd.read_csv(url, sep=r'\s+')
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-36-df0ede697ea6> in <module>()
      1 try:
----> 2     kidney_table = pd.read_table('./kidney.table')
      3 except:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    388     # Create the parser.
--> 389     parser = TextFileReader(filepath_or_buffer, **kwds)
    390 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    729 
--> 730         self._make_engine(self.engine)
    731 

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _make_engine(self, engine)
    922         if engine == 'c':
--> 923             self._engine = CParserWrapper(self.f, **self.options)
    924         else:

/usr/lib/python3/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1389 
-> 1390         self._reader = _parser.TextReader(src, **kwds)
   1391 

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:4184)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:8449)()

FileNotFoundError: File b'./kidney.table' does not exist

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1253             try:
-> 1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error

/usr/lib/python3.5/http/client.py in request(self, method, url, body, headers)
   1106         """Send a complete request to the server."""
-> 1107         self._send_request(method, url, body, headers)
   1108 

/usr/lib/python3.5/http/client.py in _send_request(self, method, url, body, headers)
   1151             body = _encode(body, 'body')
-> 1152         self.endheaders(body)
   1153 

/usr/lib/python3.5/http/client.py in endheaders(self, message_body)
   1102             raise CannotSendHeader()
-> 1103         self._send_output(message_body)
   1104 

/usr/lib/python3.5/http/client.py in _send_output(self, message_body)
    933 
--> 934         self.send(msg)
    935         if message_body is not None:

/usr/lib/python3.5/http/client.py in send(self, data)
    876             if self.auto_open:
--> 877                 self.connect()
    878             else:

/usr/lib/python3.5/http/client.py in connect(self)
    848         self.sock = self._create_connection(
--> 849             (self.host,self.port), self.timeout, self.source_address)
    850         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    711     if err is not None:
--> 712         raise err
    713     else:

/usr/lib/python3.5/socket.py in create_connection(address, timeout, source_address)
    702                 sock.bind(source_address)
--> 703             sock.connect(sa)
    704             return sock

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-36-df0ede697ea6> in <module>()
      3 except:
      4     url = 'http://stats191.stanford.edu/data/kidney.table'
----> 5     kidney_table = pd.read_csv(url, delim_whitespace=True)

/usr/lib/python3/dist-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    644                     skip_blank_lines=skip_blank_lines)
    645 
--> 646         return _read(filepath_or_buffer, kwds)
    647 
    648     parser_f.__name__ = name

/usr/lib/python3/dist-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    373     filepath_or_buffer, _, compression = get_filepath_or_buffer(
    374         filepath_or_buffer, encoding,
--> 375         compression=kwds.get('compression', None))
    376     kwds['compression'] = (inferred_compression if compression == 'infer'
    377                            else compression)

/usr/lib/python3/dist-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression)
    236 
    237     if _is_url(filepath_or_buffer):
--> 238         req = _urlopen(str(filepath_or_buffer))
    239         if compression == 'infer':
    240             content_encoding = req.headers.get('Content-Encoding', None)

/usr/lib/python3.5/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    161     else:
    162         opener = _opener
--> 163     return opener.open(url, data, timeout)
    164 
    165 def install_opener(opener):

/usr/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    464             req = meth(req)
    465 
--> 466         response = self._open(req, data)
    467 
    468         # post-process response

/usr/lib/python3.5/urllib/request.py in _open(self, req, data)
    482         protocol = req.type
    483         result = self._call_chain(self.handle_open, protocol, protocol +
--> 484                                   '_open', req)
    485         if result:
    486             return result

/usr/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    442         for handler in handlers:
    443             func = getattr(handler, meth_name)
--> 444             result = func(*args)
    445             if result is not None:
    446                 return result

/usr/lib/python3.5/urllib/request.py in http_open(self, req)
   1280 
   1281     def http_open(self, req):
-> 1282         return self.do_open(http.client.HTTPConnection, req)
   1283 
   1284     http_request = AbstractHTTPHandler.do_request_

/usr/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1254                 h.request(req.get_method(), req.selector, req.data, headers)
   1255             except OSError as err: # timeout error
-> 1256                 raise URLError(err)
   1257             r = h.getresponse()
   1258         except:

URLError: <urlopen error [Errno 111] Connection refused>

Explore the dataset

In [37]:
# Show only the first ten rows rather than rendering the whole table.
kidney_table.head(n=10)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-37-b426918048a5> in <module>()
----> 1 kidney_table.head(10)

NameError: name 'kidney_table' is not defined

Balanced panel

In [38]:
# Short alias; the formula cells that follow pass it as `data=kt`.
kt = kidney_table

# Response on the log scale; the +1 keeps zero-day records finite.
log_days = np.log(kt['Days'] + 1)

# Interaction plot: Weight on the x axis, one trace per Duration level.
plt.figure(figsize=(8, 6))
fig = interaction_plot(
    x=kt['Weight'],
    trace=kt['Duration'],
    response=log_days,
    colors=['red', 'blue'],
    markers=['D', '^'],
    ms=10,
    ax=plt.gca()
)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-38-e6704152aff6> in <module>()
----> 1 kt = kidney_table
      2 plt.figure(figsize=(8,6))
      3 fig = interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days']+1),
      4         colors=['red', 'blue'], markers=['D','^'], ms=10, ax=plt.gca())

NameError: name 'kidney_table' is not defined

Names available in the calling namespace (such as `np` in the formulas below) are also available in the formula evaluation namespace.

In [39]:
# Two-way model with interaction; `np` inside the formula is resolved from the
# notebook namespace at fit time.
kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()

table10 = anova_lm(kidney_lm)

# Each reduced model is fitted once and given a name so the three comparisons
# below read as "reduced vs. larger" instead of nested anonymous calls.
additive_lm = ols('np.log(Days+1) ~ C(Duration) + C(Weight)', data=kt).fit()
duration_only_lm = ols('np.log(Days+1) ~ C(Duration)', data=kt).fit()
weight_only_lm = ols('np.log(Days+1) ~ C(Weight)', data=kt).fit()
# Additive model with Sum coding on Weight; shared by the last two comparisons
# (previously this identical model was fitted twice).
additive_sum_lm = ols('np.log(Days+1) ~ C(Duration) + C(Weight, Sum)',
                      data=kt).fit()

# Interaction term: additive model vs. full interaction model.
print(anova_lm(additive_lm, kidney_lm))
# Weight, given Duration is already in the model.
print(anova_lm(duration_only_lm, additive_sum_lm))
# Duration, given Weight is already in the model.
print(anova_lm(weight_only_lm, additive_sum_lm))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-39-a2ff4752a520> in <module>()
----> 1 kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()
      2 
      3 table10 = anova_lm(kidney_lm)
      4 
      5 print(anova_lm(ols('np.log(Days+1) ~ C(Duration) + C(Weight)',

NameError: name 'kt' is not defined

Sum of squares

Illustrates the use of the different types of sums of squares (I, II, III) and how the Sum contrast can be used to produce the same output across all three.

Types I and II are equivalent under a balanced design.

Don't use Type III with a non-orthogonal contrast, i.e., Treatment.

In [40]:
# Two-way interaction model with Sum coding on both factors.
sum_lm = ols(formula='np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)',
             data=kt).fit()

# ANOVA tables for Type I, II and III sums of squares, in that order
# (typ=1 is what anova_lm uses when typ is omitted).
for ss_type in (1, 2, 3):
    print(anova_lm(sum_lm, typ=ss_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-40-46f87d0de498> in <module>()
      1 sum_lm = ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)',
----> 2             data=kt).fit()
      3 
      4 print(anova_lm(sum_lm))
      5 print(anova_lm(sum_lm, typ=2))

NameError: name 'kt' is not defined
In [41]:
# Same two-way interaction model, but with Treatment coding on both factors.
nosum_lm = ols(formula='np.log(Days+1) ~ C(Duration, Treatment) * C(Weight, Treatment)',
               data=kt).fit()

# ANOVA tables for Type I, II and III sums of squares, in that order
# (typ=1 is what anova_lm uses when typ is omitted).
for ss_type in (1, 2, 3):
    print(anova_lm(nosum_lm, typ=ss_type))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-41-f63f83741440> in <module>()
      1 nosum_lm = ols('np.log(Days+1) ~ C(Duration, Treatment) * C(Weight, Treatment)',
----> 2             data=kt).fit()
      3 print(anova_lm(nosum_lm))
      4 print(anova_lm(nosum_lm, typ=2))
      5 print(anova_lm(nosum_lm, typ=3))

NameError: name 'kt' is not defined