from __future__ import annotations

# Write a function that removes outliers from an otherwise normally
# distributed data set, given as a list of 2-tuples (x, y). You can
# create random inputs for testing with ‹numpy.random.normal( mean,
# stddev, count )› and then add a few outliers manually.

import numpy as np
from numpy.typing import NDArray
from typing import Callable, List, Tuple

# What exactly constitutes an outlier is somewhat domain- and
# dataset-specific, but using a small integer multiple (3-5) of
# ⟦σ⟧ (the standard deviation) as the cutoff is quite common.

# You can use pandas data frames in the implementation if you like,
# or even construct them outside and pass them to the function
# directly. Remove all points that fall strictly outside the range
# given by the ‹nsigmas› argument and return the filtered list.

def drop_outliers( data, nsigmas ):
    pass

# Now that we have a function to remove outliers, let's look at what
# effect it has. The following function should call ‹f› on both the
# original data and the outlier-culled variant. Return a 2-tuple of
# (original data, outliers removed) where each element is itself a
# 2-tuple (x, y). Apply ‹f› to each axis separately (i.e. for a
# dataset with ⟦x⟧ values ‹xs› and ⟦y⟧ values ‹ys›, return
# ‹f( xs ), f( ys )›).

Data = List[ Tuple[ float, float ] ]

def cmp_outliers( data: Data, nsigmas, f ):
    pass
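# The two stubs above are intentionally left for you to fill in. As
# a rough illustration only (not a reference solution), a possible
# single-pass approach is sketched below. The names
# ‹drop_outliers_sketch› and ‹cmp_outliers_sketch› are our own, and
# the sketch assumes the cutoff is based on the population standard
# deviation (‹np.std› with the default ‹ddof = 0›); a reasonable
# implementation may differ in these details.

def drop_outliers_sketch( data: Data, nsigmas: float ) -> Data:
    xs = np.array( [ x for x, _ in data ] )
    ys = np.array( [ y for _, y in data ] )

    # Keep a point only if both of its coordinates lie within
    # ‹nsigmas› standard deviations of the respective mean.
    def keep( x: float, y: float ) -> bool:
        return ( abs( x - xs.mean() ) <= nsigmas * xs.std() and
                 abs( y - ys.mean() ) <= nsigmas * ys.std() )

    return [ ( x, y ) for x, y in data if keep( x, y ) ]

def cmp_outliers_sketch( data: Data, nsigmas: float, f ):
    # Apply ‹f› column-wise to a data set.
    def per_axis( d: Data ):
        return ( f( [ x for x, _ in d ] ), f( [ y for _, y in d ] ) )
    return ( per_axis( data ),
             per_axis( drop_outliers_sketch( data, nsigmas ) ) )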
# Try computing mean, median, quartiles and standard deviation of a
# few data sets with a more or less severe outlier problem.

def test_main() -> None:

    # x from np.random.normal( 10, 2, 20 )
    # y from np.random.normal( 16, 4, 20 )
    # df = list( zip( x, y ) )

    df = [ (11.54374531728097, 17.033772004561364),
           (11.290028460704368, 19.681696486385874),
           (7.1643136735932975, 10.870688470455766),
           (13.971510465448336, 25.84382951213484),
           (10.235362487870768, 13.335019695487416),
           (11.570182527561014, 25.942666592712968),
           (10.04348229223319, 16.647844406377086),
           (10.860219629924522, 16.32999469177061),
           (6.6509301946328065, 21.91010945935638),
           (7.180572899557317, 11.46870838514586),
           (11.825633778267322, 17.767212925501756),
           (9.00854587508819, 10.411313403825243),
           (11.126065121853257, 9.824750824415826),
           (11.495888803926471, 16.642366166951682),
           (7.623291697323285, 14.716767839462358),
           (9.903271374302527, 15.567414333088639),
           (9.814000405042467, 10.170996444018712),
           (6.495340904669167, 16.60891860720696),
           (9.27817477485331, 17.891234152688135),
           (11.150900697249186, 16.678742256710567) ]

    # Note that values within 1 or 2 ⟦σ⟧ are not typically considered
    # outliers; we use these tight cutoffs here only to check that
    # your implementation works correctly.

    df_out = drop_outliers( df, 1 )
    assert len( df_out ) == 10
    assert all( [ 7.8 <= x <= 12 for x, _ in df_out ] )
    assert all( [ 11.5 <= y <= 21 for _, y in df_out ] )

    df_out = drop_outliers( df, 2 )
    assert len( df_out ) == 18
    assert all( [ x <= 13.9 for x, _ in df_out ] )
    assert all( [ y <= 25.8 for _, y in df_out ] )

    # We do not have any actual outliers yet (a value further than 3
    # ⟦σ⟧ from the mean), as all values were generated from a normal
    # distribution.

    df_out = drop_outliers( df, 3 )
    assert len( df_out ) == 20

    # We manually add some outliers. Notice how the mean gets skewed
    # as a result, so some values might no longer be considered
    # outliers when extreme values are present. Here, -1 gets us over
    # 3 ⟦σ⟧ and 97 even over 4 ⟦σ⟧. The value 39, however, will be
    # treated as a non-outlier. If we repeated the process, 39 would
    # get over 3 ⟦σ⟧ and would be considered an outlier. Repeated
    # removal of outliers is of course data-specific and needs to be
    # justified; in our case, the underlying problem is simply that
    # we have too few data points. You can experiment with outliers
    # at both extremes of the scale and observe what this does to the
    # mean and, consequently, to ⟦σ⟧.

    df.append( ( -1, 17 ) )
    df.append( ( 9, 39 ) )
    df.append( ( 2, 97 ) )

    df_out = drop_outliers( df, 3 )
    assert len( df_out ) == 21
    assert len( df ) == 23

    normal, out = cmp_outliers( df, 3, np.mean )
    x, y = normal
    assert 9 <= x <= 9.1
    assert 20.7 <= y <= 20.9
    x, y = out
    assert 9.8 <= x <= 10
    assert 17.3 <= y <= 17.5

    # Let us generate a bigger dataset with the same initial
    # properties, to make the outliers more visible. Run the program
    # multiple times and compare the output. Make sure you understand
    # what is happening.

    x_ = iter( np.random.normal( 10, 2, 100 ) )
    y_ = iter( np.random.normal( 16, 4, 100 ) )
    df = list( zip( x_, y_ ) )

    df.append( ( -1, 17 ) )
    df.append( ( 9, 39 ) )
    df.append( ( 2, 97 ) )
    df.append( ( 10, 2 ) )
    df.append( ( -3, 84 ) )
    df.append( ( 12, 76 ) )
    df.append( ( -4, 100 ) )
    df.append( ( 1, 1 ) )
    df.append( ( -10, 98 ) )

    def quart( i: int ) -> Callable[ [ List[ float ] ], np.float64 ]:
        def get_q( data: List[ float ] ) -> np.float64:
            return np.percentile( data, [ 25, 75 ] )[ i ]
        return get_q

    def r( data: tuple[ np.float64, np.float64 ] ) -> List[ float ]:
        return np.round( data, decimals = 2 ).tolist()

    orig, out = cmp_outliers( df, 3, np.mean )
    print( 'mean: ', r( orig ), '\n' +
           'mean (3σ): ', r( out ), '\n' )

    orig, out = cmp_outliers( df, 3, np.median )
    print( 'median: ', r( orig ), '\n' +
           'median (3σ):', r( out ), '\n' )

    for i in 0, 1:
        which = '1st' if i == 0 else '3rd'
        orig, out = cmp_outliers( df, 3, quart( i ) )
        print( f'{which} quart ', r( orig ), '\n' +
               f'{which} quart (3σ): ', r( out ) )
        orig, out = cmp_outliers( df, 2, quart( i ) )
        print( f'{which} quart (2σ): ', r( out ), '\n' )

    orig, out = cmp_outliers( df, 3, np.std )
    print( 'stddev: ', r( orig ), '\n' +
           'stddev (3σ):', r( out ), '\n' )


if __name__ == "__main__":
    test_main()
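# Appendix: the comments in ‹test_main› point out that repeating the
# removal can reclassify borderline points (such as the value 39) as
# outliers. A minimal sketch of such an iterated filter follows; the
# name ‹drop_outliers_repeatedly_sketch› and its ‹drop› parameter (a
# single-pass filter such as your ‹drop_outliers›) are our own
# additions, shown only to illustrate the idea. Whether repeated
# removal is justified depends on the data set.

def drop_outliers_repeatedly_sketch(
        data: Data, nsigmas: float,
        drop: Callable[ [ Data, float ], Data ] ) -> Data:
    # Re-apply the single-pass filter until it stops removing points,
    # i.e. a fixed point is reached.
    while True:
        culled = drop( data, nsigmas )
        if len( culled ) == len( data ):
            return culled
        data = culled

# Usage (with the illustrative sketch from above):
#   drop_outliers_repeatedly_sketch( df, 3, drop_outliers_sketch )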