;; @module stat.lsp ;; @description Basic statistics and plotting library ;; @version 3.0 - Eliminated plot functions and f-prob which now is built-in as prob-f ;; @version 3.1 - Documentation changes ;; @version 3.2 - Documentation, mention built-in 'stats' function since 10.4.2 ;; ;; @author Lutz Mueller, 2001-2014 ;;

Functions for statistics

;; To use this module it has to be loaded at the beginning of the ;; program file: ;;
;; (load (append (env "NEWLISPDIR") "/modules/stat.lsp"))
;; ; or shorter
;; (module "stat.lsp")
;; 
;; All functions work on integers and floats or a mix of both. are normal ;; LISP lists. are lists of lists, one list for each row in the ;; two dimensional data matrix. See the function 'stat:matrix' on how to make matrices ;; from lists. ;; ;; In version 3.0 of 'stat.lsp' the usage for Gnuplot the stat:plot and stat:plotXY ;; functions has been eliminated. Instead use the module 'plot.lsp' shipped with all binary and ;; source distributions. The F-distribution function 'f-prob' has also been eliminated ;; instead use one of new the built-in 'prob-f' or 'crit-f' functions. ;; ;; The documention contains only the call patterns. See the source for more ;; documentation. ;; ;;

Summary of functions


;;

General uni- and bi- variate statistics

;;
;; stat:sum      - sum of a vector of numbers (see also built-in stats since 10.4.2)
;; stat:mean     - arithmetik mean of a vector of numbers (see also built-in stats since 10.4.2)
;; stat:var      - estimated variance of numbers in a vector sample
;; stat:sdev     - estimated standard deviation of numbers in a vector (see also built-in stats since 10.4.2)
;; stat:sum-sq   - sum of squares of a data vector
;; stat:sum-xy   - sum of products of a two data vectors
;; stat:corr     - correlation coefficient between two vectors (built-in since 10.4.2)
;; stat:cov      - covariance of two number vectors
;; stat:sum-d2   - sum of squared differences of a vector from its mean
;; stat:sum-d2xy - sum of squared diffferences of two vectors
;; stat:regression - calculates the intecept and slope of a regression estimate
;; stat:fit      - return the fitted line using regression coefficients
;; stat:moments  - calulates 1st to 3rd moments from a vector of numbers
;; 
;;

Multi variate statistics

;;
;; stat:multiple-reg  - calculates a multiple regression
;; stat:cov-matrix    - calculates a covariance matrix
;; stat:corr-matrix   - calculates a correlation matrix
;; 
;;

Time series

;;
;; stat:smooth   - smoothes a vector of numbers
;; stat:lag      - calcultes a difference list with specified lag
;; stat:cumulate - cumulate a data vector
;; stat:power    - calculate the power spectrum of a time series
;; 
;;

Matrix and list utilities

;;
;; stat:matrix       - make a matrix from column vectors
;; stat:diagonal     - make a diagonal matrix
;; stat:get-diagonal - return the diagonal of a matrix in a vector
;; stat:mat-map      - map a binary function on to matrices
;; 
(if (< (sys-info -2) 10111) (constant (global 'term) name)) ;; @syntax (stat:corr ) ;; @param A list of numbers. ;; @param A list of numbers. ;; @return Correlation coefficient of lists and . ;; A 'corr' native function is built into newLISP since 10.4.2. ;; @syntax (stat:cov ) ;; @param A list of numbers. ;; @param A list of numbers. ;; @return Covariance of data in lists and ;; @syntax (stat:cov-matrix ) ;; @param A matrix of numbers. ;; @return Covariance matrix of with rows and columns. ;; @syntax (stat:corr-matrix ) ;; @param A matrix of numbers. ;; @return Correlation matrix of with rows and columns. ;; @syntax (stat:cumulate ) ;; @param A list of numbers. ;; @return The cumulated list of . ;; @syntax (stat:diagonal ) ;; @param The diagonal element. ;; @return A diagonal matrix of length with in the diagonal. ;; @syntax (stat:fit ) ;; @param A list of numbers. ;; @param A list of numbers. ;; @return fitted line based on '(stat:regression X Y)'. ;; @syntax (stat:f-prob ) ;; @param The variance ratio. ;; @param Degrees of freedom. ;; @param Degrees of freedom. ;; @return Probablity of F variance ratio for , degress of freedom. ;; @syntax (stat:get-diagonal ) ;; @param An matrix filled with numbers. ;; @return A list from the diagonal elements of . ;; @syntax (stat:lag ) ;; @param A list of numbers. ;; @param Lag n. ;; @return A differenced list of with a lag of . ;; If the length of list is then the length of the resulting ;; differenced list is . ;; @syntax (stat:mat-map ) ;; @return Matrix map, e.g. '(stat:mat-map + A B)'. ;; Used for adding and subtracting matrices. ;; @syntax (stat:matrix .... ) ;; @param The first column list of values. ;; @param The Nth column list of values. ;; @return A matrix off <1> to columns . ;; @syntax (stat:mean ) ;; @param A list of numbers. ;; @return The mean of data in list . ;; @syntax (stat:moments ) ;; @param A list of numbers. ;; @return Calculates all moments of list . ;; @syntax (stat:multiple-reg ) ;; @param A matrix of numbers. ;; @param Zero based offset into . ;; @return Multiple regression of vars in onto at . ;; @syntax (stat:power ) ;; @param A time series of numbers. ;; @return The power spectrum of a time series ;; @syntax (stat:regression ) ;; @param A list of numbers. ;; @param A list of numbers. ;; returns <(b0 b1)> coefficients of regression . ;; @syntax (stat:sdev ) ;; @param A list of numbers. ;; @return Standard deviation of data in list . ;; @syntax (stat:smooth ) ;; @param A list of numbers. ;; @param Smoothing coefficient <0 < alpha < 1>. ;; @return Exponentially smoothed sequence in . ;; @syntax (stat:sum ) ;; @param A list of numbers, ;; @return Sum of data in list . ;; @syntax (stat:sum-d2 ) ;; @param A list of numbers. ;; @return Sum of squared diffs <(x - mean(X))^2> in list . ;; @syntax (stat:sum-d2xy ) ;; @return Sum of squared differences <(x - y)^2> of elements in lists and . ;; @syntax (stat:sum-sq ) ;; @param A list of numbers. ;; @return Sum of data elements in list . ;; @syntax (stat:sum-xy ) ;; @param A list of numbers. ;; @param A list of numbers. ;; @return Sum of products data elements in lists and . ;; @syntax (stat:var ) ;; @param A list of numbers. ;; @return The variance of the data in list . (context 'stat) ;------------------- General uni and bi-variate statistics -------------------- ; sum of a data vector X (define (sum X) (apply add X)) ; mean of a data vector X (define (mean X) (div (sum X) (length X))) ; variance of a data vector X (define (var X) (div (sum-d2 X) (sub (length X) 1))) ; standard deviation of a data vector X (define (sdev X) (sqrt (var X))) ; sum of squares of a data vector X (define (sum-sq X) (apply add (map mul X X))) ; sum of the product of data vectors X*Y (define (sum-xy X Y) (apply add (map mul X Y))) ; covariance of data vectors X Y (define (cov X Y) (sub (sum-xy X Y) (div (mul (sum X) (sum Y)) (length X)))) ; sum of sqared differenses of X to mean of X (define (sum-d2 X) (sub (sum-sq X) (div (mul (sum X) (sum X)) (length X)))) ; Pearson r, product moment correlation of data vectors X and Y (define (stat:corr X Y) (div (cov X Y) (sqrt (mul (sum-d2 X) (sum-d2 Y))))) ; regression Yest = b0 + b1*X calculates intercept b0 and slope b1 (define (regression X Y) (set 'b1 (div (cov X Y) (sum-d2 X))) (set 'b0 (sub (mean Y) (mul b1 (mean X)))) (list b0 b1)) ; fitted line using regression Y on X (define (fit X Y, coeffs b0 b1) (set 'coeffs (regression X Y)) (set 'b0 (first coeffs)) (set 'b1 (last coeffs)) (map (lambda (x) (add b0 (mul x b1))) X)) ; sum of squared differences of X and Y (define (sum-d2xy X Y) (apply add (map (lambda (x y) (mul (sub x y) (sub x y))) X Y))) ; moments of a vector of numbers ; (define (moments vector, n median mean avg-dev std-dev var skew kurtosis dev sum) (set 'n (length vector)) (set 'sum (apply add vector)) (set 'mean (div sum n)) (set 'avg-dev 0 'std-dev 0 'var 0 'skew 0 'kurtosis 0) (set 'dev (map sub vector (dup mean n))) (set 'avg-dev (div (apply add (map abs dev)) n)) (set 'var (div (apply add (map mul dev dev)) (- n 1))) (set 'skew (apply add (map mul dev dev dev))) (set 'kurtosis (apply add (map mul dev dev dev dev))) (set 'std-dev (sqrt var)) (if (> var 0.0) (begin (set 'skew (div skew (mul n var std-dev))) (set 'kurtosis (sub (div kurtosis (mul n var var)) 3.0)))) (sort vector) (set 'mid (/ n 2)) (if (= 0 (% n 2)) (set 'median (div (add (nth mid vector) (nth (- mid 1) vector)) 2)) (set 'median (nth mid vector))) (list n median mean avg-dev std-dev var skew kurtosis) ; (println (format "n: %d" n)) ; (println (format "median: %f" median)) ; (println (format "mean: %f" mean)) ; (println (format "average_deviation: %f" avg-dev)) ; (println (format "standard_deviation: %f" std-dev)) ; (println (format "variance: %f" var)) ; (println (format "skew: %f" skew)) ; (println (format "kurtosis: %f" kurtosis)) ) ;-------------------------------- Time Series ---------------------------------- ; expontial smoothing with 0 < alpha <= 1 (define (smooth lst alpha , previous slist) (set 'previous (first lst)) (set 'slist '()) (dolist (elmnt lst) (set 'previous (add (mul alpha elmnt) (mul (sub 1 alpha) previous))) (push previous slist)) (reverse slist)) ; could be written shorter starting v.9.9.5 ; because push returns the modified list ; ; seasonal difference list with variable lag ; the resulting list is lag shorterm than the original ; (define (lag lst n , sLst) (set 'sLst lst) (dotimes (i n) (pop lst)) (set 'sLst (slice sLst 0 (length lst))) (map sub lst sLst)) ; ; cumulate of a list ; (define (cumulate lst , sc cum) (set 'sc 0.0) (set 'cum '()) (dolist (x lst) (push (inc sc x) cum)) (reverse cum)) ; could be written shorter after 9.9.5 ; because push returns the list ; ; power spectrum ; ; takes a rows by 2 columns matrix with real part in the first and ; imagenary part the in the second column. If all numbers are real ; then the second column is just 0's. ; ; returns a matrix with two rows. First row contains power numbers ; and second row contains the respective frequencies ; (define (power ts , lenOrg fts n n2 ps mid frqs) ; remember original length (set 'lenOrg (length ts)) ; do discrete fourier transform (set 'fts (transpose (fft ts))) ; calc power spectrum (set 'n (length (transpose fts))) (set 'n2 (mul n n)) (set 'ps (map (lambda (x y) (add (mul x x) (mul y y))) (nth 0 fts) (nth 1 fts))) (set 'ps (map (lambda (x) (mul (div x n2) 2)) ps)) ; the first and last are not multiplied by 2, divide them back ; use deprecated nth-set in versions older than 9.9.02 (setf (ps 0) (div (first ps) 2)) (set 'mid (sub (div n 2) 1)) (replace mid ps (div (nth mid ps) 2)) ; calc a vector with frequencies, adjusted for the new power-2 length ; which came back from the FFT (set 'frqs (sequence 0 (- n 1))) (set 'frqs (map (lambda (x) (mul (div x n) lenOrg)) frqs)) (transpose (matrix ps frqs))) ;------------------------- multivariate statistics ----------------------------- ; ; multiple regression of variables in X onto one of variables in X, Y ; indicated by column offset offY ; ; X is N rows by k columns, the column at offset offY is Y ; ; returns a matrix with two rows: ; first row is regression coefficients and multiple R: b0, b1, b2 ....., R ; second row is sum of squares: regression-SQ, error-SQ, total-SQ ; (the unused part of the second row is nil padded) ; ; the SQs can be used to calculate mean sqares for regression and error: ; ; regression-MSQ = regression-SQ / (k - 1) ; error-MSQ = error-sq / (n - k - 1) ; ; F-ratio = regression-MSQ / error-MSQ ; with k and (n - k - 1) df degreees of freedom ; ; ; ; (define (multiple-reg X offY , Y Ycoffs b b0 R2 Yest sqErr sqTotal sqReg sq d) (set 'covX (cov-matrix X)) (set 'Y (extract-col X offY)) ; covX is the covariance matrix (pop covX offY) (set 'cvX (transpose covX)) ; the covariance matrix is reduced to cvX and the ; extracted values put in Ycoffs (set 'Ycoffs (matrix (pop cvX offY))) ; b contains the regression coefficients except for b0 (set 'b (multiply (invert cvX) Ycoffs)) ; calculate multiple R2 as b'*b / sqTotal (set 'sqTotal (sum-d2 Y)) (set 'R2 (div (first (first (multiply (transpose b) Ycoffs))) sqTotal)) ; estimate Y without b0 (set 'Yest (multiply (reduce-col X offY) b)) ; calculate b0, d is the difference between Y and the Y estimate ; b0 is the mean of differences between Y and Yest (set 'd (mat-map sub (matrix Y) Yest)) (set 'b0 (mean (first (transpose d)))) ; estimate Y including b0 (set 'Yest (mat-map add Yest (matrix (dup b0 (length Yest))))) ; error sum of squares (set 'sqErr (sum-d2xy Y (first (transpose Yest)))) ; regression sum of squares (set 'sqReg (sub sqTotal sqErr)) ; make list b out of b0, b1, b2 ... sqrt(R2) (set 'b (append (list b0) (first (transpose b)) (list (sqrt R2)))) ; make list sq out of sqReg, sqErr and sqTotal (set 'sq (list sqReg sqErr sqTotal)) ; return matrix with two rows: (transpose (matrix b sq))) ; ; covariance matrix cov ; ; matrix x with N rows and k columns ; ; (define (cov-matrix X , XtX N I sumX sumX2) (set 'XtX (multiply (transpose X) X)) (set 'N (length X)) (set 'I (matrix (dup 1 N))) (set 'sumX (multiply (transpose X) I)) (set 'sumX2 (multiply sumX (transpose sumX))) (set 'sumX2 (multiply sumX2 (diagonal (div 1 N) (length sumX2)))) (mat-map sub XtX sumX2)) ; ; correlation matrix ; ; matrix X with N rows and k columns ; ; (define (corr-matrix X , covX N d dd) (set 'covX (cov-matrix X)) (set 'd (matrix (get-diagonal covX))) (set 'dd (multiply d (transpose d))) (set 'dd (map (lambda (z) (map sqrt z)) dd)) (mat-map div covX dd)) ; ; probablity of F variance ratio with degrees of freedom df1 df2 ; ; (define (f-prob F df1 df2) (let (prob (mul 2 (betai (div df2 (add df2 (mul df1 F))) (mul 0.5 df2) (mul 0.5 df1)))) (div (if (> prob 1) (sub 2 prob) prob) 2))) ;----------------------------- utility functions ------------------------------- ; ; make a matrix from 1 up to 16 lists ; (define (matrix) (transpose (args))) ; ; make a diagonal matrix n by n and elmnt in the diagonal ; ; (define (diagonal elmnt n, m lst) (set 'm '()) (dotimes (i n) (set 'lst (dup 0 n)) ; use deprecated nth-set in versions older than 9.9.02 (if (< (sys-info -2) 9902) (nth-set (lst i) elmnt) (setf (lst i) elmnt)) (push lst m)) (reverse m)) ; make shorter starting v.9.9.5 ; ; get the diagonal from a square matrix ; (define (get-diagonal X , d x) (set 'd '()) (dotimes (idx (length X)) (push (nth idx (nth idx X)) d)) (reverse d)) ; ; matrix map ; ; e.g.: (mat-map sub A B) ;; for matrix subtraction ; (define (mat-map op A B) (map (lambda (x y) (map op x y)) A B)) ; ; reduce matrix by a column at offset ; ; returns the reduced matrix ; (define (reduce-col matr off, X) (set 'matr (transpose matr)) (pop matr off) (transpose matr)) ; ; extract a column from a matrix ; ; returns the extracted column ; (define (extract-col matr off, X) (pop (transpose matr) off)) ; ; convert list to ascii lines terminated by CR-LF ; for storage in files usable by Gnuplot, R, Excel etc. ; ; example: ; ; (write-file "MyData.txt" (list2ascii mydata-list)) ; (define (list2ascii lst) (append (join (map string lst) "\r\n") "\r\n")) ; eof