# In this case, the input data will again be (x, y) tuples, but
# distributed around a straight line, and we will compute linear
# regression on the data. This time, we will remove outliers
# iteratively: find the data point with the greatest squared residual
# and, if that squared residual is larger than ‹cutoff›-times the sum
# of all squared residuals, drop the data point and restart the
# regression. Stop when there are no more outliers.

# Feel free to use ‹pandas› and/or ‹numpy›.

def drop_outliers( data, cutoff ): # add arguments if you like
    pass # return filtered data

def regress( data, cutoff ):
    pass # remove outliers iteratively
         # return the slope and the intercept of the regression line

# NOTE: In both ‹p5› and this exercise, we have taken a rather
# cavalier approach to outlier removal. For real statistics on real
# data, you often need to be much more careful and take the origin
# of the data set into account. Always disclose any outliers you
# have removed from further consideration.

def test_regress() -> None:
    import numpy as np
    # import matplotlib.pyplot as plt

    rng = np.random.default_rng( 1337 )
    x = np.linspace( 0, 100, 50 ) # generate 50 values between 0 and 100
    delta = rng.normal( 0, 15, x.size )
    y = -2 * x - 1 + delta # add noise

    for _ in range( 4 ):
        x = np.append( x, rng.uniform( 0, 100.0 ) )
        y = np.append( y, rng.uniform( -200, 200.0 ) )

    # you can use the following to plot the data if you like
    # plt.plot( x, y, 'bo' )
    # plt.savefig( "regress.png" )

    data = list( zip( x, y ) )

    a, b = regress( data, 1/3 )
    assert -1.87 < a < -1.85
    assert -10 < b < -8

    a, b = regress( data, 1/4 )
    assert -2.06 < a < -2.05
    assert 2.41 < b < 2.42

if __name__ == "__main__":
    test_regress()
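
# --- A possible solution sketch (not part of the exercise skeleton) ---
# The functions below show one way the procedure described at the top
# could be implemented with ‹numpy›: fit a least-squares line, find the
# point with the largest squared residual, drop it whenever it exceeds
# ‹cutoff›-times the sum of all squared residuals, and refit until no
# such point remains. The helper name ‹fit_line› and the ‹_sketch›
# suffixes are illustrative only. Renaming these into the stubs above
# should satisfy the tests, but small implementation choices (e.g.
# tie-breaking between equal residuals) could shift the fitted values
# slightly; this sketch has not been verified against the reference
# numbers.

import numpy as np

def fit_line( points ):
    # ordinary least-squares fit of a degree-1 polynomial;
    # returns ( slope, intercept )
    xs = np.array( [ x for x, _ in points ] )
    ys = np.array( [ y for _, y in points ] )
    slope, intercept = np.polyfit( xs, ys, 1 )
    return slope, intercept

def drop_outliers_sketch( points, cutoff ):
    # assumes at least two points survive the removal
    points = list( points )
    while True:
        slope, intercept = fit_line( points )
        sq_res = [ ( y - ( slope * x + intercept ) ) ** 2
                   for x, y in points ]
        worst = max( range( len( points ) ), key = lambda i: sq_res[ i ] )
        if sq_res[ worst ] <= cutoff * sum( sq_res ):
            return points       # nothing qualifies as an outlier → stop
        del points[ worst ]     # drop the worst point and refit

def regress_sketch( points, cutoff ):
    return fit_line( drop_outliers_sketch( points, cutoff ) )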