“Every wall is a door.” [Ralph Waldo Emerson]

Abstract

Real life data sometimes includes extreme (outlier) values which you might want to ignore or to exclude:

sbORB

Appendix sbORB Code

Please read my Disclaimer.

Option Explicit

Function sbORB(rY As Range, rX As Range, _
    Optional dSigmaFactor As Double = 3#, _
    Optional dMaxOutlierPercentage As Double = 0.5) As Variant
'sbORB() = outlier resistant beta returns a beta and
'an alpha where y = beta * x + alpha is most accurate
'for (almost) all given x in rX and y in rY.
'"Almost" means that we successively (one by one) throw out outliers
'which have a distance of > dSigmaFactor * STDEV_of_all_Distances
'from the least square (LS) proxy.
'Source (EN): http://www.sulprobil.com/sborb_en/
'Source (DE): http://www.bplumhoff.de/sborb_de/
'(C) (P) by Bernd Plumhoff 24-Jun-2007 PB V0.9
Dim vLinEst As Variant 'store LinEst() result of recent LS proxy during iterations
Dim dm2 As Double 'ortogonal slope to recent LS proxy
Dim dc As Double 'Constant c in: y2=m2*x2+c which is ortogonal to LS proxy through a given point
Dim dx2 As Double 'x2 in: y2 = m2 * x2 + c which is ortogonal to LS proxy through a given point
Dim dy2 As Double 'y2 in: y2 = m2 * x2 + c which is ortogonal to LS proxy through a given point
Dim i As Long, j As Long
Dim lcount As Long 'holds current number of live points
Dim lcount_orig As Long 'original (starting) number of points
Dim lcount_old As Long 'holds number of live points of previous iteration
Dim daverage As Double 'average of distances to LS proxy of current iterations' live points
Dim dstdev As Double 'Stdev of distances to LS proxy of current iterations' live points
Dim dDistMax As Double
Dim lDistMaxIdx As Long

lcount = rX.Rows.Count
If rX.Columns.Count > lcount Then
    lcount = rX.Columns.Count
End If
lcount_orig = lcount
lcount_old = lcount + 1

ReDim dDist(1 To lcount) As Double 'store distances of live points to recent LS proxy (line)
ReDim dX(1 To lcount) As Double
ReDim dY(1 To lcount) As Double    'store coordinates of "live" points during iterations

'read data row-wise or column-wise
If rX.Rows.Count > rX.Columns.Count Then
    For i = 1 To lcount
        dX(i) = rX.Cells(i, 1)
        dY(i) = rY.Cells(i, 1)
    Next i
Else
    For i = 1 To lcount
        dX(i) = rX.Cells(1, i)
        dY(i) = rY.Cells(1, i)
    Next i
End If

Do
    lcount_old = lcount
    ReDim Preserve dDist(1 To lcount) As Double 'Store distances of live points to last LS proxy
    ReDim Preserve dX(1 To lcount) As Double
    ReDim Preserve dY(1 To lcount) As Double 'Store coordinates of "live" points during iterations
    vLinEst = Application.WorksheetFunction.LinEst(dY, dX, True, True)
    dDistMax = 0#
    lDistMaxIdx = 1
    For i = 1 To lcount
        'Calculate distances of live points to recent LS proxy
        dm2 = -1# / vLinEst(1, 1)
        dc = dY(i) - dX(i) * dm2
        dx2 = (dc - vLinEst(1, 2)) / (vLinEst(1, 1) - dm2)
        dy2 = dm2 * dx2 + dc
        dDist(i) = Sqr((dX(i) - dx2) * (dX(i) - dx2) + (dY(i) - dy2) * (dY(i) - dy2))
        'remember largest distance and its index
        If dDist(i) > dDistMax Then
            dDistMax = dDist(i)
            lDistMaxIdx = i
        End If
    Next i
    'calculate average and standard deviation of live points' distances to LS proxy
    daverage = Application.WorksheetFunction.Average(dDist)
    dstdev = Application.WorksheetFunction.StDev(dDist)
'    'kill points with distance > dSigmaFactor * dstdev 'Attention: might erase too many points
'    j = 1
'    For i = 1 To lcount
'        If dDist(i) <= dstdev * dSigmaFactor Then
'            dX(j) = dX(i)
'            dY(j) = dY(i)
'            j = j + 1
'        Else
'            Debug.Print "Lcount: " & lcount & ". Throwing out (" & dX(i) & ";" & dY(i) & ")"
'        End If
'    Next i
'    lcount = j - 1
    'kill point with largest distance > dSigmaFactor * dstdev
    If dDist(lDistMaxIdx) >= dstdev * dSigmaFactor Then
        Debug.Print "Lcount: " & lcount & ". Throwing out (" & dX(lDistMaxIdx) & _
                    ";" & dY(lDistMaxIdx) & ")"
        dX(lDistMaxIdx) = dX(lcount)
        dY(lDistMaxIdx) = dY(lcount)
        lcount = lcount - 1
    End If
Loop While lcount_old > lcount And lcount / lcount_orig > 1# - dMaxOutlierPercentage

If lcount < lcount_old Then
    vLinEst = Application.WorksheetFunction.LinEst(dY, dX, True, True)
End If

sbORB = vLinEst

End Function

Download

Please read my Disclaimer.

sbORB.xlsm [24 KB Excel file, open and use at your own risk]