/usr/lib/python2.7/dist-packages/cogent/maths/stats/test.py is in python-cogent 1.9-9.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 | #!/usr/bin/env python
"""Provides standard statistical tests. Tests produce statistic and P-value.
"""
from __future__ import division
import warnings
from cogent.maths.stats.distribution import chi_high, z_low, z_high, zprob, \
t_high, t_low, tprob, f_high, f_low, fprob, binomial_high, binomial_low, \
ndtri
from cogent.maths.stats.special import lgam, log_one_minus, one_minus_exp,\
MACHEP
from cogent.maths.stats.ks import psmirnov2x, pkstwo
from cogent.maths.stats.kendall import pkendall, kendalls_tau
from cogent.maths.stats.special import Gamma
from numpy import absolute, arctanh, array, asarray, concatenate, transpose, \
ravel, take, nonzero, log, sum, mean, cov, corrcoef, fabs, any, \
reshape, tanh, clip, nan, isnan, isinf, sqrt, trace, exp, \
median as _median, zeros, ones
#, std - currently incorrect
from numpy.random import permutation, randint
from cogent.maths.stats.util import Numbers
from operator import add
from random import choice
__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Gavin Huttley", "Rob Knight", "Catherine Lozupone",
"Sandra Smit", "Micah Hamady", "Daniel McDonald",
"Greg Caporaso", "Jai Ram Rideout", "Michael Dwan"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Production"
class IndexOrValueError(IndexError, ValueError): pass
var = cov #cov will calculate variance if called on a vector
def std_(x, axis=None):
"""Returns standard deviations by axis (similiar to numpy.std)
The result is unbiased, matching the result from MLab.std
"""
x = asarray(x)
if axis is None:
d = x - mean(x)
return sqrt(sum(d**2)/(len(x)-1))
elif axis == 0:
result = []
for col in range(x.shape[1]):
vals = x[:,col]
d = vals - mean(vals)
result.append(sqrt(sum(d**2)/(len(x)-1)))
return result
elif axis == 1:
result = []
for row in range(x.shape[0]):
vals = x[row,:]
d = vals - mean(vals)
result.append(sqrt(sum(d**2)/(len(x)-1)))
return result
else:
raise ValueError, "axis out of bounds"
# tested only by std
def var(x, axis=None):
"""Returns unbiased standard deviations over given axis.
Similar with numpy.std, except that it is unbiased. (var = SS/n-1)
x: a float ndarray or asarray(x) is a float ndarray.
axis=None: computed for the flattened array by default, or compute along an
integer axis.
Implementation Notes:
Change the SS calculation from:
SumSq(x-x_bar) to SumSq(x) - SqSum(x)/n
See p. 37 of Zar (1999) Biostatistical Analysis.
"""
x = asarray(x)
#figure out sample size along the axis
if axis is None:
n = x.size
else:
n = x.shape[axis]
#compute the sum of squares from the mean(s)
sample_SS = sum(x**2, axis) - sum(x, axis)**2/n
return sample_SS / (n-1)
def std(x, axis=None):
"""computed unbiased standard deviations along given axis or flat array.
Similar with numpy.std, except that it is unbiased. (var = SS/n-1)
x: a float ndarray or asarray(x) is a float ndarray.
axis=None: computed for the flattened array by default, or compute along an
given integer axis.
"""
try:
sample_variance = var(x, axis=axis)
except IndexError, e: #just to avoid breaking the old test code
raise IndexOrValueError(e)
return sqrt(sample_variance)
def median(m, axis=None):
"""Returns medians by axis (similiar to numpy.mean)
numpy.median does not except an axis parameter. Is safe for substition for
numpy.median
"""
median_vals = []
rows, cols = m.shape
if axis is None:
return _median(ravel(m))
elif axis == 0:
for col in range(cols):
median_vals.append(_median(m[:,col]))
elif axis == 1 or axis == -1:
for row in range(rows):
median_vals.append(_median(m[row,:]))
else:
raise ValueError, "axis(=%s) out of bounds" % axis
return array(median_vals)
class ZeroExpectedError(ValueError):
"""Class for handling tests where an expected value was zero."""
pass
def G_2_by_2(a, b, c, d, williams=1, directional=1):
"""G test for independence in a 2 x 2 table.
Usage: G, prob = G_2_by_2(a, b, c, d, willliams, directional)
Cells are in the order:
a b
c d
a, b, c, and d can be int, float, or long.
williams is a boolean stating whether to do the Williams correction.
directional is a boolean stating whether the test is 1-tailed.
Briefly, computes sum(f ln f) for cells - sum(f ln f) for
rows and columns + f ln f for the table.
Always has 1 degree of freedom
To generalize the test to r x c, use the same protocol:
2*(cells - rows/cols + table), then with (r-1)(c-1) df.
Note that G is always positive: to get a directional test,
the appropriate ratio (e.g. a/b > c/d) must be tested
as a separate procedure. Find the probability for the
observed G, and then either halve or halve and subtract from
one depending on whether the directional prediction was
upheld.
The default test is now one-tailed (Rob Knight 4/21/03).
See Sokal & Rohlf (1995), ch. 17. Specifically, see box 17.6 (p731).
"""
cells = [a, b, c, d]
n = sum(cells)
#return 0 if table was empty
if not n:
return (0, 1)
#raise error if any counts were negative
if min(cells) < 0:
raise ValueError, \
"G_2_by_2 got negative cell counts(s): must all be >= 0."
G = 0
#Add x ln x for items, adding zero for items whose counts are zero
for i in filter(None, cells):
G += i * log(i)
#Find totals for rows and cols
ab = a + b
cd = c + d
ac = a + c
bd = b + d
rows_cols = [ab, cd, ac, bd]
#exit if we are missing a row or column entirely: result counts as
#never significant
if min(rows_cols) == 0:
return (0, 1)
#Subtract x ln x for rows and cols
for i in filter(None, rows_cols):
G -= i * log(i)
#Add x ln x for table
G += n * log(n)
#Result needs to be multiplied by 2
G *= 2
#apply Williams correction
if williams:
q = 1 + (( ( (n/ab) + (n/cd) ) -1 ) * ( ( (n/ac) + (n/bd) ) -1))/(6*n)
G /= q
p = chi_high(max(G,0), 1)
#find which tail we were in if the test was directional
if directional:
is_high = ((b == 0) or (d != 0 and (a/b > c/d)))
p = tail(p, is_high)
if not is_high:
G = -G
return G, p
def safe_sum_p_log_p(a, base=None):
"""Calculates p * log(p) safely for an array that may contain zeros."""
flat = ravel(a)
nz = take(flat, nonzero(flat)[0])
logs = log(nz)
if base:
logs /= log(base)
return sum(nz * logs, 0)
def G_ind(m, williams=False):
"""Returns G test for independence in an r x c table.
Requires input data as a numpy array. From Sokal and Rohlf p 738.
"""
f_ln_f_elements = safe_sum_p_log_p(m)
f_ln_f_rows = safe_sum_p_log_p(sum(m,0))
f_ln_f_cols = safe_sum_p_log_p(sum(m,1))
tot = sum(ravel(m))
f_ln_f_table = tot * log(tot)
df = (len(m)-1) * (len(m[0])-1)
G = 2*(f_ln_f_elements-f_ln_f_rows-f_ln_f_cols+f_ln_f_table)
if williams:
q = 1+((tot*sum(1.0/sum(m,1))-1)*(tot*sum(1.0/sum(m,0))-1)/ \
(6*tot*df))
G = G/q
return G, chi_high(max(G,0), df)
def calc_contingency_expected(matrix):
"""Calculates expected frequencies from a table of observed frequencies
The input matrix is a dict2D object and represents a frequency table
with different variables in the rows and columns. (observed
frequencies as values)
The expected value is calculated with the following equation:
Expected = row_total x column_total / overall_total
The returned matrix (dict2D) has lists of the observed and the
expected frequency as values
"""
#transpose matrix for calculating column totals
t_matrix = matrix.copy()
t_matrix.transpose()
overall_total = sum(list(matrix.Items))
#make new matrix for storing results
result = matrix.copy()
#populate result with expected values
for row in matrix:
row_sum = sum(matrix[row].values())
for item in matrix[row]:
column_sum = sum(t_matrix[item].values())
#calculate expected frequency
Expected = (row_sum * column_sum)/overall_total
result[row][item] = [result[row][item]]
result[row][item].append(Expected)
return result
def G_fit(obs, exp, williams=1):
"""G test for fit between two lists of counts.
Usage: test, prob = G_fit(obs, exp, williams)
obs and exp are two lists of numbers.
williams is a boolean stating whether to do the Williams correction.
SUM(2 f(obs)ln (f(obs)/f(exp)))
See Sokal and Rohlf chapter 17.
"""
k = len(obs)
if k != len(exp):
raise ValueError, "G_fit requires two lists of equal length."
G = 0
n = 0
for o, e in zip(obs, exp):
if o < 0:
raise ValueError, \
"G_fit requires all observed values to be positive."
if e <= 0:
raise ZeroExpectedError, \
"G_fit requires all expected values to be positive."
if o: #if o is zero, o * log(o/e) must be zero as well.
G += o * log(o/e)
n += o
G *= 2
if williams:
q = 1 + (k + 1)/(6*n)
G /= q
return G, chi_high(G, k - 1)
def G_fit_from_Dict2D(data):
"""G test for fit on a Dict2D
data is a dict2D. Values are a list containing the observed
and expected frequencies (can be created with calc_contingency_expected)
"""
obs_counts = []
exp_counts = []
for item in data.Items:
if len(item) == 2:
obs_counts.append(item[0])
exp_counts.append(item[1])
g_val, prob = G_fit(obs_counts, exp_counts)
return g_val, prob
def chi_square_from_Dict2D(data):
"""Chi Square test on a Dict2D
data is a Dict2D. The values are a list of the observed (O)
and expected (E) frequencies,(can be created with calc_contingency_expected)
The chi-square value (test) is the sum of (O-E)^2/E over the items in data
degrees of freedom are calculated from data as:
(r-1)*(c-1) if cols and rows are both > 1
otherwise is just 1 - the # of rows or columns
(whichever is greater than 1)
"""
test = sum([((item[0] - item[1]) * (item[0] - item[1]))/item[1] \
for item in data.Items])
num_rows = len(data)
num_cols = len([col for col in data.Cols])
if num_rows == 1:
df = num_cols - 1
elif num_cols == 1:
df = num_rows - 1
elif num_rows == 0 or num_cols == 0:
raise ValueError, "data matrix must have data"
else:
df = (len(data) - 1) * (len([col for col in data.Cols]) - 1)
return test, chi_high(test, df)
def likelihoods(d_given_h, priors):
"""Calculate likelihoods through marginalization, given Pr(D|H) and priors.
Usage: scores = likelihoods(d_given_h, priors)
d_given_h and priors are equal-length lists of probabilities. Returns
a list of the same length of numbers (not probabilities).
"""
#check that the lists of Pr(D|H_i) and priors are equal
length = len(d_given_h)
if length != len(priors):
raise ValueError, "Lists not equal lengths."
#find weighted sum of Pr(H_i) * Pr(D|H_i)
wt_sum = 0
for d, p in zip(d_given_h, priors):
wt_sum += d * p
#divide each Pr(D|H_i) by the weighted sum and multiply by its prior
#to get its likelihood
return [d/wt_sum for d in d_given_h]
def posteriors(likelihoods, priors):
"""Calculate posterior probabilities given priors and likelihoods.
Usage: probabilities = posteriors(likelihoods, priors)
likelihoods is a list of numbers. priors is a list of probabilities.
Returns a list of probabilities (0-1).
"""
#Check that there is a prior for each likelihood
if len(likelihoods) != len(priors):
raise ValueError, "Lists not equal lengths."
#Posterior probability is defined as prior * likelihood
return [l * p for l, p in zip(likelihoods, priors)]
def bayes_updates(ds_given_h, priors = None):
"""Successively apply lists of Pr(D|H) to get Pr(H|D) by marginalization.
Usage: final_probs = bayes_updates(ds_given_h, [priors])
ds_given_h is a list (for each form of evidence) of lists of probabilities.
priors is optionally a list of the prior probabilities.
Returns a list of posterior probabilities.
"""
try:
first_list = ds_given_h[0]
length = len(first_list)
#calculate flat prior if none was passed
if not priors:
priors = [1/length] * length
#apply each form of data to the priors to get posterior probabilities
for index, d in enumerate(ds_given_h):
#first, ignore the form of data if all the d's are the same
all_the_same = True
first_element = d[0]
for i in d:
if i != first_element:
all_the_same = False
break
if not all_the_same: #probabilities won't change
if len(d) != length:
raise ValueError, "bayes_updates requires equal-length lists."
liks = likelihoods(d, priors)
pr = posteriors(liks, priors)
priors = pr
return priors #posteriors after last calculation are 'priors' for next
#return column of zeroes if anything went wrong, e.g. if the sum of one of
#the ds_given_h is zero.
except (ZeroDivisionError, FloatingPointError):
return [0] * length
def t_paired(a,b, tails=None, exp_diff=0):
"""Returns t and prob for TWO RELATED samples of scores a and b.
From Sokal and Rohlf (1995), p. 354.
Calculates the vector of differences and compares it to exp_diff
using the 1-sample t test.
Usage: t, prob = t_paired(a, b, tails, exp_diff)
t is a float; prob is a probability.
a and b should be equal-length lists of paired observations (numbers).
tails should be None (default), 'high', or 'low'.
exp_diff should be the expected difference in means (a-b); 0 by default.
"""
n = len(a)
if n != len(b):
raise ValueError, 'Unequal length lists in ttest_paired.'
try:
diffs = array(a) - array(b)
return t_one_sample(diffs, popmean=exp_diff, tails=tails)
except (ZeroDivisionError, ValueError, AttributeError, TypeError, \
FloatingPointError):
return (None, None)
def t_one_sample(a,popmean=0, tails=None):
"""Returns t for ONE group of scores a, given a population mean.
Usage: t, prob = t_one_sample(a, popmean, tails)
t is a float; prob is a probability.
a should support Mean, StandardDeviation, and Count.
popmean should be the expected mean; 0 by default.
tails should be None (default), 'high', or 'low'.
"""
try:
n = len(a)
t = (mean(a) - popmean)/(std(a)/sqrt(n))
except (ZeroDivisionError, ValueError, AttributeError, TypeError, \
FloatingPointError):
return None, None
if isnan(t) or isinf(t):
return None, None
prob = t_tailed_prob(t, n-1, tails)
return t, prob
def t_two_sample(a, b, tails=None, exp_diff=0, none_on_zero_variance=True):
"""Returns t, prob for two INDEPENDENT samples of scores a, and b.
From Sokal and Rohlf, p 223.
Usage: t, prob = t_two_sample(a,b, tails, exp_diff)
t is a float; prob is a probability.
a and b should be sequences of observations (numbers). Need not be equal
lengths.
tails should be None (default), 'high', or 'low'.
exp_diff should be the expected difference in means (a-b); 0 by default.
none_on_zero_variance: if True, will return (None,None) if both a and b
have zero variance (e.g. a=[1,1,1] and b=[2,2,2]). If False, the
following values will be returned:
Two-tailed test (tails=None):
a < b: (-inf,0.0)
a > b: (+inf,0.0)
One-tailed 'high':
a < b: (-inf,1.0)
a > b: (+inf,0.0)
One-tailed 'low':
a < b: (-inf,0.0)
a > b: (+inf,1.0)
If a and b both have no variance and have the same single value (e.g.
a=[1,1,1] and b=[1,1,1]), (None,None) will always be returned.
"""
if tails is not None and tails != 'high' and tails != 'low':
raise ValueError("Invalid tail type '%s'. Must be either None, "
"'high', or 'low'." % tails)
try:
#see if we need to back off to the single-observation for single-item
#groups
n1 = len(a)
if n1 < 2:
return t_one_observation(sum(a), b, tails, exp_diff,
none_on_zero_variance=none_on_zero_variance)
n2 = len(b)
if n2 < 2:
t, prob = t_one_observation(sum(b), a, reverse_tails(tails),
exp_diff, none_on_zero_variance=none_on_zero_variance)
# Negate the t-statistic because we swapped the order of the inputs
# in the t_one_observation call, as well as tails.
if t != 0:
t = -t
return (t, prob)
#otherwise, calculate things properly
x1 = mean(a)
x2 = mean(b)
var1 = var(a)
var2 = var(b)
if var1 == 0 and var2 == 0:
# Both lists do not vary.
if x1 == x2 or none_on_zero_variance:
result = (None, None)
else:
result = _t_test_no_variance(x1, x2, tails)
else:
# At least one list varies.
df = n1+n2-2
svar = ((n1-1)*var1 + (n2-1)*var2)/df
t = (x1-x2-exp_diff)/sqrt(svar*(1/n1 + 1/n2))
if isnan(t) or isinf(t):
result = (None, None)
else:
prob = t_tailed_prob(t, df, tails)
result = (t, prob)
except (ZeroDivisionError, ValueError, AttributeError, TypeError,
FloatingPointError), e:
#invalidate if the sample sizes are wrong, the values aren't numeric or
#aren't present, etc.
result = (None, None)
return result
def _t_test_no_variance(mean1, mean2, tails):
"""Handles case where two distributions have no variance."""
if tails is not None and tails != 'high' and tails != 'low':
raise ValueError("Invalid tail type '%s'. Must be either None, "
"'high', or 'low'." % tails)
if tails is None:
if mean1 < mean2:
result = (float('-inf'), 0.0)
else:
result = (float('inf'), 0.0)
elif tails == 'high':
if mean1 < mean2:
result = (float('-inf'), 1.0)
else:
result = (float('inf'), 0.0)
else:
if mean1 < mean2:
result = (float('-inf'), 0.0)
else:
result = (float('inf'), 1.0)
return result
def mc_t_two_sample(x_items, y_items, tails=None, permutations=999,
exp_diff=0):
"""Performs a two-sample t-test with Monte Carlo permutations.
x_items and y_items must be INDEPENDENT observations (sequences of
numbers). They do not need to be of equal length.
Returns the observed t statistic, the parametric p-value, a list of t
statistics obtained through Monte Carlo permutations, and the nonparametric
p-value obtained from the Monte Carlo permutations test.
This code is partially based on Jeremy Widmann's
qiime.make_distance_histograms.monte_carlo_group_distances code.
Arguments:
x_items - the first list of observations
y_items - the second list of observations
tails - if None (the default), a two-sided test is performed. 'high'
or 'low' for one-tailed tests
permutations - the number of permutations to use in calculating the
nonparametric p-value. Must be a number greater than or equal to 0.
If 0, the nonparametric test will not be performed. In this case,
the list of t statistics obtained from permutations will be empty,
and the nonparametric p-value will be None
exp_diff - the expected difference in means (x_items - y_items)
"""
if tails is not None and tails != 'high' and tails != 'low':
raise ValueError("Invalid tail type '%s'. Must be either None, "
"'high', or 'low'." % tails)
if permutations < 0:
raise ValueError("Invalid number of permutations: %d. Must be greater "
"than or equal to zero." % permutations)
if (len(x_items) == 1 and len(y_items) == 1) or \
(len(x_items) < 1 or len(y_items) < 1):
raise ValueError("At least one of the sequences of observations is "
"empty, or the sequences each contain only a single "
"observation. Cannot perform the t-test.")
# Perform t-test using original observations.
obs_t, param_p_val = t_two_sample(x_items, y_items, tails=tails,
exp_diff=exp_diff,
none_on_zero_variance=False)
# Only perform the Monte Carlo test if we got a sane answer back from the
# initial t-test and we have been specified permutations.
nonparam_p_val = None
perm_t_stats = []
if permutations > 0 and obs_t is not None and param_p_val is not None:
# Permute observations between x_items and y_items the specified number
# of times.
perm_x_items, perm_y_items = _permute_observations(x_items, y_items,
permutations)
perm_t_stats = [t_two_sample(perm_x_items[n], perm_y_items[n],
tails=tails, exp_diff=exp_diff,
none_on_zero_variance=False)[0]
for n in range(permutations)]
# Compute nonparametric p-value based on the permuted t-test results.
if tails is None:
better = (absolute(array(perm_t_stats)) >= absolute(obs_t)).sum()
elif tails == 'low':
better = (array(perm_t_stats) <= obs_t).sum()
elif tails == 'high':
better = (array(perm_t_stats) >= obs_t).sum()
nonparam_p_val = (better + 1) / (permutations + 1)
return obs_t, param_p_val, perm_t_stats, nonparam_p_val
def _permute_observations(x_items, y_items, permutations,
permute_f=permutation):
"""Returns permuted versions of the sequences of observations.
Values are permuted between x_items and y_items (i.e. shuffled between the
two input sequences of observations).
This code is based on Jeremy Widmann's
qiime.make_distance_histograms.permute_between_groups code.
"""
num_x = len(x_items)
num_y = len(y_items)
num_total_obs = num_x + num_y
combined_obs = concatenate((x_items, y_items))
# Generate a list of all permutations.
perms = [permute_f(num_total_obs) for i in range(permutations)]
# Use random permutations to split into groups.
rand_xs = [combined_obs[perm[:num_x]] for perm in perms]
rand_ys = [combined_obs[perm[num_x:num_total_obs]] for perm in perms]
return rand_xs, rand_ys
def t_one_observation(x, sample, tails=None, exp_diff=0,
none_on_zero_variance=True):
"""Returns t-test for significance of single observation versus a sample.
Equation for 1-observation t (Sokal and Rohlf 1995 p 228):
t = obs - mean - exp_diff / (var * sqrt((n+1)/n))
df = n - 1
none_on_zero_variance: see t_two_sample for details. If sample has no
variance and its single value is the same as x (e.g. x=1 and
sample=[1,1,1]), (None,None) will always be returned
"""
try:
sample_mean = mean(sample)
sample_std = std(sample)
if sample_std == 0:
# The list does not vary.
if sample_mean == x or none_on_zero_variance:
result = (None, None)
else:
result = _t_test_no_variance(x, sample_mean, tails)
else:
# The list varies.
n = len(sample)
t = (x - sample_mean - exp_diff)/sample_std/sqrt((n+1)/n)
prob = t_tailed_prob(t, n-1, tails)
result = (t, prob)
except (ZeroDivisionError, ValueError, AttributeError, TypeError,
FloatingPointError):
result = (None, None)
return result
def pearson(x_items, y_items):
"""Returns Pearson's product moment correlation coefficient.
This will always be a value between -1.0 and +1.0. x_items and y_items must
be the same length, and cannot have fewer than 2 elements each. If one or
both of the input vectors do not have any variation, the return value will
be 0.0.
Arguments:
x_items - the first list of observations
y_items - the second list of observations
"""
x_items, y_items = array(x_items), array(y_items)
if len(x_items) != len(y_items):
raise ValueError("The length of the two vectors must be the same in "
"order to calculate the Pearson correlation "
"coefficient.")
if len(x_items) < 2:
raise ValueError("The two vectors must both contain at least 2 "
"elements. The vectors are of length %d." % len(x_items))
sum_x = sum(x_items)
sum_y = sum(y_items)
sum_x_sq = sum(x_items*x_items)
sum_y_sq = sum(y_items*y_items)
sum_xy = sum(x_items*y_items)
n = len(x_items)
try:
r = 1.0 * ((n * sum_xy) - (sum_x * sum_y)) / \
(sqrt((n * sum_x_sq)-(sum_x*sum_x))*sqrt((n*sum_y_sq)-(sum_y*sum_y)))
except (ZeroDivisionError, ValueError, FloatingPointError): #no variation
r = 0.0
#check we didn't get a naughty value for r due to rounding error
if r > 1.0:
r = 1.0
elif r < -1.0:
r = -1.0
return r
def spearman(x_items, y_items):
"""Returns Spearman's rho.
This will always be a value between -1.0 and +1.0. x_items and y_items must
be the same length, and cannot have fewer than 2 elements each. If one or
both of the input vectors do not have any variation, the return value will
be 0.0.
Arguments:
x_items - the first list of observations
y_items - the second list of observations
"""
x_items, y_items = array(x_items), array(y_items)
if len(x_items) != len(y_items):
raise ValueError("The length of the two vectors must be the same in "
"order to calculate Spearman's rho.")
if len(x_items) < 2:
raise ValueError("The two vectors must both contain at least 2 "
"elements. The vectors are of length %d." % len(x_items))
# Rank the two input vectors.
rank1, ties1 = _get_rank(x_items)
rank2, ties2 = _get_rank(y_items)
if ties1 == 0 and ties2 == 0:
n = len(rank1)
sum_sqr = sum([(x-y)**2 for x,y in zip(rank1,rank2)])
rho = 1 - (6*sum_sqr/(n*(n**2 - 1)))
else:
avg = lambda x: sum(x)/len(x)
x_bar = avg(rank1)
y_bar = avg(rank2)
numerator = sum([(x-x_bar)*(y-y_bar) for x,y in zip(rank1, rank2)])
denominator = sqrt(sum([(x-x_bar)**2 for x in rank1])*
sum([(y-y_bar)**2 for y in rank2]))
# Calculate rho. Handle the case when there is no variation in one or
# both of the input vectors.
if denominator == 0.0:
rho = 0.0
else:
rho = numerator/denominator
return rho
def _get_rank(data):
"""Ranks the elements of a list. Used in Spearman correlation."""
indices = range(len(data))
ranks = range(1,len(data)+1)
indices.sort(key=lambda index:data[index])
ranks.sort(key=lambda index:indices[index-1])
data_len = len(data)
i = 0
ties = 0
while i < data_len:
j = i + 1
val = data[indices[i]]
try:
val += 0
except TypeError:
raise(TypeError)
while j < data_len and data[indices[j]] == val:
j += 1
dup_ranks = j - i
val = float(ranks[indices[i]]) + (dup_ranks-1)/2.0
for k in range(i, i+dup_ranks):
ranks[indices[k]] = val
i += dup_ranks
ties += dup_ranks-1
return ranks, ties
def correlation(x_items, y_items):
"""Returns Pearson correlation between x and y, and its significance.
WARNING: x_items and y_items must be same length!
This function is retained for backwards-compatibility. Please use
correlation_test() for more control over how the test is performed.
"""
return correlation_test(x_items, y_items, method='pearson', tails=None,
permutations=0)[:2]
def correlation_test(x_items, y_items, method='pearson', tails=None,
permutations=999, confidence_level=0.95):
"""Computes the correlation between two vectors and its significance.
Computes a parametric p-value by using Student's t-distribution with df=n-2
to perform the test of significance, as well as a nonparametric p-value
obtained by permuting one of the input vectors the specified number of
times given by the permutations parameter. A confidence interval is also
computed using Fisher's Z transform if the number of observations is
greater than 3. Please see Sokal and Rohlf pp. 575-580 and pg. 598-601 for
more details regarding these techniques.
Warning: the parametric p-value is unreliable when the method is spearman
and there are less than 11 observations in each vector.
Returns the correlation coefficient (r or rho), the parametric p-value, a
list of the r or rho values obtained from permuting the input, the
nonparametric p-value, and a tuple for the confidence interval, with the
first element being the lower bound of the confidence interval and the
second element being the upper bound for the confidence interval. The
confidence interval will be (None, None) if the number of observations is
not greater than 3.
x_items and y_items must be the same length, and cannot have fewer than 2
elements each. If one or both of the input vectors do not have any
variation, r or rho will be 0.0.
Note: the parametric portion of this function is based on the correlation
function in this module.
Arguments:
x_items - the first list of observations
y_items - the second list of observations
method - 'pearson' or 'spearman'
tails - if None (the default), a two-sided test is performed. 'high'
for a one-tailed test for positive association, or 'low' for a
one-tailed test for negative association. This parameter affects
both the parametric and nonparametric tests, but the confidence
interval will always be two-sided
permutations - the number of permutations to use in the nonparametric
test. Must be a number greater than or equal to 0. If 0, the
nonparametric test will not be performed. In this case, the list of
correlation coefficients obtained from permutations will be empty,
and the nonparametric p-value will be None
confidence_level - the confidence level to use when constructing the
confidence interval. Must be between 0 and 1 (exclusive)
"""
# Perform some initial error checking.
if method == 'pearson':
corr_fn = pearson
elif method == 'spearman':
corr_fn = spearman
else:
raise ValueError("Invalid method '%s'. Must be either 'pearson' or "
"'spearman'." % method)
if tails is not None and tails != 'high' and tails != 'low':
raise ValueError("Invalid tail type '%s'. Must be either None, "
"'high', or 'low'." % tails)
if permutations < 0:
raise ValueError("Invalid number of permutations: %d. Must be greater "
"than or equal to zero." % permutations)
if confidence_level <= 0 or confidence_level >= 1:
raise ValueError("Invalid confidence level: %.4f. Must be between "
"zero and one." % confidence_level)
# Calculate the correlation coefficient.
corr_coeff = corr_fn(x_items, y_items)
# Perform the parametric test first.
x_items, y_items = array(x_items), array(y_items)
n = len(x_items)
df = n - 2
if n < 3:
parametric_p_val = 1
else:
try:
t = corr_coeff / sqrt((1 - (corr_coeff * corr_coeff)) / df)
parametric_p_val = t_tailed_prob(t, df, tails)
except (ZeroDivisionError, FloatingPointError):
# r/rho was presumably 1.
parametric_p_val = 0
# Perform the nonparametric test.
permuted_corr_coeffs = []
nonparametric_p_val = None
better = 0
for i in range(permutations):
permuted_y_items = y_items[permutation(n)]
permuted_corr_coeff = corr_fn(x_items, permuted_y_items)
permuted_corr_coeffs.append(permuted_corr_coeff)
if tails is None:
if abs(permuted_corr_coeff) >= abs(corr_coeff):
better += 1
elif tails == 'high':
if permuted_corr_coeff >= corr_coeff:
better += 1
elif tails == 'low':
if permuted_corr_coeff <= corr_coeff:
better += 1
else:
# Not strictly necessary since this was checked above, but included
# for safety in case the above check gets removed or messed up. We
# don't want to return a p-value of 0 if someone passes in a bogus
# tail type somehow.
raise ValueError("Invalid tail type '%s'. Must be either None, "
"'high', or 'low'." % tails)
if permutations > 0:
nonparametric_p_val = (better + 1) / (permutations + 1)
# Compute the confidence interval for corr_coeff using Fisher's Z
# transform.
z_crit = abs(ndtri((1 - confidence_level) / 2))
ci_low, ci_high = None, None
if n > 3:
try:
ci_low = tanh(arctanh(corr_coeff) - (z_crit / sqrt(n - 3)))
ci_high = tanh(arctanh(corr_coeff) + (z_crit / sqrt(n - 3)))
except (ZeroDivisionError, FloatingPointError):
# r/rho was presumably 1 or -1. Match what R does in this case.
ci_low, ci_high = corr_coeff, corr_coeff
return (corr_coeff, parametric_p_val, permuted_corr_coeffs,
nonparametric_p_val, (ci_low, ci_high))
def correlation_matrix(series, as_rows=True):
"""Returns pairwise correlations between each pair of series.
"""
return corrcoef(series, rowvar=as_rows)
#unused codes below
if as_rows:
return corrcoef(transpose(array(series)))
else:
return corrcoef(array(series))
def regress(x, y):
"""Returns coefficients to the regression line "y=ax+b" from x[] and y[].
Specifically, returns (slope, intercept) as a tuple from the regression of
y on x, minimizing the error in y assuming that x is precisely known.
Basically, it solves
Sxx a + Sx b = Sxy
Sx a + N b = Sy
where Sxy = \sum_i x_i y_i, Sx = \sum_i x_i, and Sy = \sum_i y_i. The
solution is
a = (Sxy N - Sy Sx)/det
b = (Sxx Sy - Sx Sxy)/det
where det = Sxx N - Sx^2. In addition,
Var|a| = s^2 |Sxx Sx|^-1 = s^2 | N -Sx| / det
|b| |Sx N | |-Sx Sxx|
s^2 = {\sum_i (y_i - \hat{y_i})^2 \over N-2}
= {\sum_i (y_i - ax_i - b)^2 \over N-2}
= residual / (N-2)
R^2 = 1 - {\sum_i (y_i - \hat{y_i})^2 \over \sum_i (y_i - \mean{y})^2}
= 1 - residual/meanerror
Adapted from the following URL:
http://www.python.org/topics/scicomp/recipes_in_python.html
"""
x, y = array(x,'Float64'), array(y,'Float64')
N = len(x)
Sx = sum(x)
Sy = sum(y)
Sxx = sum(x*x)
Syy = sum(y*y)
Sxy = sum(x*y)
det = Sxx * N - Sx * Sx
return (Sxy * N - Sy * Sx)/det, (Sxx * Sy - Sx * Sxy)/det
def regress_origin(x,y):
"""Returns coefficients to regression "y=ax+b" passing through origin.
Requires vectors x and y of same length.
See p. 351 of Zar (1999) Biostatistical Analysis.
returns slope, intercept as a tuple.
"""
x, y = array(x,'Float64'), array(y,'Float64')
return sum(x*y)/sum(x*x), 0
def regress_R2(x, y):
"""Returns the R^2 value for the regression of x and y
Used the method explained on pg 334 ofJ.H. Zar, Biostatistical analysis,
fourth edition. 1999
"""
slope, intercept = regress(x,y)
coords = zip(x, y)
Sx = Sy = Syy = SXY = 0.0
n = float(len(y))
for x, y in coords:
SXY += x*y
Sx += x
Sy += y
Syy += y*y
Sxy = SXY - (Sx*Sy)/n
regSS = slope * Sxy
totSS = Syy - ((Sy*Sy)/n)
return regSS/totSS
def regress_residuals(x, y):
"""reports the residual (error) for each point from the linear regression"""
slope, intercept = regress(x, y)
coords = zip(x, y)
residuals = []
for x, y in coords:
e = y - (slope * x) - intercept
residuals.append(e)
return residuals
def stdev_from_mean(x):
"""returns num standard deviations from the mean of each val in x[]"""
x = array(x)
return (x - mean(x))/std(x)
def regress_major(x, y):
"""Returns major-axis regression line of y on x.
Use in cases where there is error in both x and y.
"""
x, y = array(x), array(y)
N = len(x)
Sx = sum(x)
Sy = sum(y)
Sxx = sum(x*x)
Syy = sum(y*y)
Sxy = sum(x*y)
var_y = (Syy-((Sy*Sy)/N))/(N-1)
var_x = (Sxx-((Sx*Sx)/N))/(N-1)
cov = (Sxy-((Sy*Sx)/N))/(N-1)
mean_y = Sy/N
mean_x = Sx/N
D = sqrt((var_y + var_x)*(var_y + var_x) - 4*(var_y*var_x - (cov*cov)))
eigen_1 = (var_y + var_x + D)/2
slope = cov/(eigen_1 - var_y)
intercept = mean_y - (mean_x * slope)
return (slope, intercept)
def z_test(a, popmean=0, popstdev=1, tails=None):
"""Returns z and probability score for a single sample of items.
Calculates the z-score on ONE sample of items with mean x, given a population
mean and standard deviation (parametric).
Usage: z, prob = z_test(a, popmean, popstdev, tails)
z is a float; prob is a probability.
a is a sample with Mean and Count.
popmean should be the parametric population mean; 0 by default.
popstdev should be the parametric population standard deviation, 1 by default.
tails should be None (default), 'high', or 'low'.
"""
try:
z = (mean(a) - popmean)/popstdev*sqrt(len(a))
return z, z_tailed_prob(z, tails)
except (ValueError, TypeError, ZeroDivisionError, AttributeError, \
FloatingPointError):
return None
def z_tailed_prob(z, tails):
"""Returns appropriate p-value for given z, depending on tails."""
if tails == 'high':
return z_high(z)
elif tails == 'low':
return z_low(z)
else:
return zprob(z)
def t_tailed_prob(t, df, tails):
"""Return appropriate p-value for given t and df, depending on tails."""
if tails == 'high':
return t_high(t, df)
elif tails == 'low':
return t_low(t, df)
else:
return tprob(t,df)
def reverse_tails(tails):
"""Swaps high for low or vice versa, leaving other values alone."""
if tails == 'high':
return 'low'
elif tails == 'low':
return 'high'
else:
return tails
def tail(prob, test):
"""If test is true, returns prob/2. Otherwise returns 1-(prob/2).
"""
prob /= 2
if test:
return prob
else:
return 1 - prob
def combinations(n, k):
"""Returns the number of ways of choosing k items from n.
"""
return exp(lgam(n+1) - lgam(k+1) - lgam(n-k+1))
def multiple_comparisons(p, n):
"""Corrects P-value for n multiple comparisons.
Calculates directly if p is large and n is small; resorts to logs
otherwise to avoid rounding (1-p) to 1
"""
if p > 1e-6: #if p is large and n small, calculate directly
return 1 - (1-p)**n
else:
return one_minus_exp(-n * p)
def multiple_inverse(p_final, n):
"""Returns p_initial for desired p_final with n multiple comparisons.
WARNING: multiple_inverse is not very reliable when p_final is very close
to 1 (say, within 1e-4) since we then take the ratio of two very similar
numbers.
"""
return one_minus_exp(log_one_minus(p_final)/n)
def multiple_n(p_initial, p_final):
"""Returns number of comparisons such that p_initial maps to p_final.
WARNING: not very accurate when p_final is very close to 1.
"""
return log_one_minus(p_final)/log_one_minus(p_initial)
def fisher(probs):
"""Uses Fisher's method to combine multiple tests of a hypothesis.
-2 * SUM(ln(P)) gives chi-squared distribution with 2n degrees of freedom.
"""
try:
return chi_high(-2 * sum(map(log, probs)), 2 * len(probs))
except OverflowError, e:
return 0.0
def f_value(a,b):
"""Returns the num df, the denom df, and the F value.
a, b: lists of values, must have Variance attribute (recommended to
make them Numbers objects.
The F value is always calculated by dividing the variance of a by the
variance of b, because R uses the same approach. In f_two_value it's
decided what p-value is returned, based on the relative sizes of the
variances.
"""
if not any(a) or not any(b) or len(a) <= 1 or len(b) <= 1:
raise ValueError, "Vectors should contain more than 1 element"
F = var(a)/var(b)
dfn = len(a)-1
dfd = len(b)-1
return dfn, dfd, F
def f_two_sample(a, b, tails=None):
"""Returns the dfn, dfd, F-value and probability for two samples a, and b.
a and b: should be independent samples of scores. Should be lists of
observations (numbers).
tails should be None(default, two-sided test), 'high' or 'low'.
This implementation returns the same results as the F test in R.
"""
dfn, dfd, F = f_value(a, b)
if tails == 'low':
return dfn, dfd, F, f_low(dfn, dfd, F)
elif tails == 'high':
return dfn, dfd, F, f_high(dfn, dfd, F)
else:
if var(a) >= var(b):
side='right'
else:
side='left'
return dfn, dfd, F, fprob(dfn, dfd, F, side=side)
def ANOVA_one_way(a):
"""Performs a one way analysis of variance
a is a list of lists of observed values. Each list is the values
within a category. The analysis must include 2 or more categories(lists).
the lists must have a Mean and variance attribute. Recommende to make
the Numbers objects
An F value is first calculated as the variance of the group means
divided by the mean of the within-group variances.
"""
group_means = []
group_variances = []
num_cases = 0
all_vals = []
for i in a:
num_cases += len(i)
group_means.append(i.Mean)
group_variances.append(i.Variance * (len(i)-1))
all_vals.extend(i)
group_means = Numbers(group_means)
#get within group variances (denominator)
group_variances = Numbers(group_variances)
dfd = num_cases - len(group_means)
within_MS = sum(group_variances)/dfd
#get between group variances (numerator)
grand_mean = Numbers(all_vals).Mean
between_MS = 0
for i in a:
diff = i.Mean - grand_mean
diff_sq = diff * diff
x = diff_sq * len(i)
between_MS += x
dfn = len(group_means) - 1
between_MS = between_MS/dfn
F = between_MS/within_MS
return dfn, dfd, F, between_MS, within_MS, group_means, f_high(dfn, dfd, F)
def MonteCarloP(value, rand_values, tail = 'high'):
"""takes a true value and a list of random values as
input and returns a p-value
tail indicates which side of the distribution to look at:
low = look for smaller values than expected by chance
high = look for larger values than expected by chance
"""
pop_size= len(rand_values)
rand_values.sort()
if tail == 'high':
num_better = pop_size
for i, curr_val in enumerate(rand_values):
if value <= curr_val:
num_better = i
break
p_val = 1-(num_better / pop_size)
elif tail == 'low':
num_better = pop_size
for i, curr_val in enumerate(rand_values):
if value < curr_val:
num_better = i
break
p_val = num_better / pop_size
return p_val
def sign_test(success, trials, alt="two sided"):
"""Returns the probability for the sign test.
Arguments:
- success: the number of successes
- trials: the number of trials
- alt: the alternate hypothesis, one of 'less', 'greater', 'two sided'
(default).
"""
lo = ["less", "lo", "lower", "l"]
hi = ["greater", "hi", "high", "h", "g"]
two = ["two sided", "2", 2, "two tailed", "two"]
alt = alt.lower().strip()
if alt in lo:
p = binomial_low(success, trials, 0.5)
elif alt in hi:
success -= 1
p = binomial_high(success, trials, 0.5)
elif alt in two:
success = min(success, trials-success)
hi = 1 - binomial_high(success, trials, 0.5)
lo = binomial_low(success, trials, 0.5)
p = hi+lo
else:
raise RuntimeError("alternate [%s] not in %s" % (lo+hi+two))
return p
def ks_test(x, y=None, alt="two sided", exact = None, warn_for_ties = True):
"""Returns the statistic and probability from the Kolmogorov-Smirnov test.
Arguments:
- x, y: vectors of numbers whose distributions are to be compared.
- alt: the alternative hypothesis, default is 2-sided.
- exact: whether to compute the exact probability
- warn_for_ties: warns when values are tied. This should left at True
unless a monte carlo variant, like ks_boot, is being used.
Note the 1-sample cases are not implemented, although their cdf's are
implemented in ks.py"""
# translation from R 2.4
num_x = len(x)
num_y = None
x = zip(x, zeros(len(x), int))
lo = ["less", "lo", "low", "lower", "l", "lt"]
hi = ["greater", "hi", "high", "h", "g", "gt"]
two = ["two sided", "2", 2, "two tailed", "two", "two.sided"]
Pval = None
if y is not None: # in anticipation of actually implementing the 1-sample cases
num_y = len(y)
y = zip(y, ones(len(y), int))
n = num_x * num_y / (num_x + num_y)
combined = x + y
if len(set(combined)) < num_x + num_y:
ties = True
else:
ties = False
combined = array(combined, dtype=[('stat', float), ('sample', int)])
combined.sort(order='stat')
cumsum = zeros(combined.shape[0], float)
scales = array([1/num_x, -1/num_y])
indices = combined['sample']
cumsum = scales.take(indices)
cumsum = cumsum.cumsum()
if exact == None:
exact = num_x * num_y < 1e4
if alt in two:
stat = max(fabs(cumsum))
elif alt in lo:
stat = -cumsum.min()
elif alt in hi:
stat = cumsum.max()
else:
raise RuntimeError, "Unknown alt: %s" % alt
if exact and alt in two and not ties:
Pval = 1 - psmirnov2x(stat, num_x, num_y)
else:
raise NotImplementedError
if Pval == None:
if alt in two:
Pval = 1 - pkstwo(sqrt(n) * stat)
else:
Pval = exp(-2 * n * stat**2)
if ties and warn_for_ties:
warnings.warn("Cannot compute correct KS probability with ties")
try: # if numpy arrays were input, the Pval can be an array of len==1
Pval = Pval[0]
except (TypeError, IndexError):
pass
return stat, Pval
def _get_bootstrap_sample(x, y, num_reps):
"""yields num_reps random samples drawn with replacement from x and y"""
combined = array(list(x) + list(y))
total_obs = len(combined)
num_x = len(x)
for i in range(num_reps):
# sampling with replacement
indices = randint(0, total_obs, total_obs)
sampled = combined.take(indices)
# split into the two populations
sampled_x = sampled[:num_x]
sampled_y = sampled[num_x:]
yield sampled_x, sampled_y
def ks_boot(x, y, alt = "two sided", num_reps=1000):
"""Monte Carlo (bootstrap) variant of the Kolmogorov-Smirnov test. Useful
for when there are ties.
Arguments:
- x, y: vectors of numbers
- alt: alternate hypothesis, as per ks_test
- num_reps: number of replicates for the bootstrap"""
# based on the ks_boot method in the R Matching package
# see http://sekhon.berkeley.edu/matching/
# One important difference is I preserve the original sample sizes
# instead of making them equal
tol = MACHEP * 100
combined = array(list(x) + list(y))
observed_stat, _p = ks_test(x, y, exact=False, warn_for_ties=False)
total_obs = len(combined)
num_x = len(x)
num_greater = 0
for sampled_x, sampled_y in _get_bootstrap_sample(x, y, num_reps):
sample_stat, _p = ks_test(sampled_x, sampled_y, alt=alt, exact=False,
warn_for_ties=False)
if sample_stat >= (observed_stat - tol):
num_greater += 1
return observed_stat, num_greater / num_reps
def _average_rank(start_rank, end_rank):
ave_rank = sum(range(start_rank, end_rank+1)) / (1+end_rank-start_rank)
return ave_rank
def mw_test(x, y):
"""computes the Mann-Whitney U statistic and the probability using the
normal approximation"""
if len(x) > len(y):
x, y = y, x
num_x = len(x)
num_y = len(y)
x = zip(x, zeros(len(x), int), zeros(len(x), int))
y = zip(y, ones(len(y), int), zeros(len(y), int))
combined = x+y
combined = array(combined, dtype=[('stat', float), ('sample', int),
('rank', float)])
combined.sort(order='stat')
prev = None
start = None
ties = False
T = 0.0
for index in range(combined.shape[0]):
value = combined['stat'][index]
sample = combined['sample'][index]
if value == prev and start is None:
start = index
continue
if value != prev and start is not None:
ties = True
ave_rank = _average_rank(start, index)
num_tied = index - start + 1
T += (num_tied**3 - num_tied)
for i in range(start-1, index):
combined['rank'][i] = ave_rank
start = None
combined['rank'][index] = index+1
prev = value
if start is not None:
ave_rank = _average_rank(start, index)
num_tied = index - start + 2
T += (num_tied**3 - num_tied)
for i in range(start-1, index+1):
combined['rank'][i] = ave_rank
total = combined.shape[0]
x_ranks_sum = sum(combined['rank'][i] for i in range(total) if combined['sample'][i] == 0)
prod = num_x * num_y
U1 = prod + (num_x * (num_x+1) / 2) - x_ranks_sum
U2 = prod - U1
U = max([U1, U2])
numerator = U - prod / 2
denominator = sqrt((prod / (total * (total-1)))*((total**3 - total - T)/12))
z = (numerator/denominator)
p = zprob(z)
return U, p
def mw_boot(x, y, num_reps=1000):
"""Monte Carlo (bootstrap) variant of the Mann-Whitney test.
Arguments:
- x, y: vectors of numbers
- num_reps: number of replicates for the bootstrap
Uses the same Monte-Carlo resampling code as kw_boot
"""
tol = MACHEP * 100
combined = array(list(x) + list(y))
observed_stat, obs_p = mw_test(x, y)
total_obs = len(combined)
num_x = len(x)
num_greater = 0
for sampled_x, sampled_y in _get_bootstrap_sample(x, y, num_reps):
sample_stat, sample_p = mw_test(sampled_x, sampled_y)
if sample_stat >= (observed_stat - tol):
num_greater += 1
return observed_stat, num_greater / num_reps
def permute_2d(m, p):
"""Performs 2D permutation of matrix m according to p."""
return m[p][:, p]
#unused below
m_t = transpose(m)
r_t = take(m_t, p, axis=0)
return take(transpose(r_t), p, axis=0)
def mantel(m1, m2, n):
"""Compares two distance matrices. Reports P-value for correlation.
The p-value is based on a two-sided test.
WARNING: The two distance matrices must be symmetric, hollow distance
matrices, as only the lower triangle (excluding the diagonal) will be used
in the calculations (matching R's vegan::mantel function).
This function is retained for backwards-compatibility. Please use
mantel_test() for more control over how the test is performed.
"""
return mantel_test(m1, m2, n)[0]
def mantel_test(m1, m2, n, alt="two sided",
suppress_symmetry_and_hollowness_check=False):
"""Runs a Mantel test on two distance matrices.
Returns the p-value, Mantel correlation statistic, and a list of Mantel
correlation statistics for each permutation test.
WARNING: The two distance matrices must be symmetric, hollow distance
matrices, as only the lower triangle (excluding the diagonal) will be used
in the calculations (matching R's vegan::mantel function).
Arguments:
m1 - the first distance matrix to use in the test (should be a numpy
array or convertible to a numpy array)
m2 - the second distance matrix to use in the test (should be a numpy
array or convertible to a numpy array)
n - the number of permutations to test when calculating the p-value
alt - the type of alternative hypothesis to test (can be either
'two sided' for a two-sided test, 'greater' or 'less' for one-sided
tests)
suppress_symmetry_and_hollowness_check - by default, the input distance
matrices will be checked for symmetry and hollowness. It is
recommended to leave this check in place for safety, as the check
is fairly fast. However, if you *know* you have symmetric and
hollow distance matrices, you can disable this check for small
performance gains on extremely large distance matrices
"""
# Perform some sanity checks on our input.
if alt not in ("two sided", "greater", "less"):
raise ValueError("Unrecognized alternative hypothesis. Must be either "
"'two sided', 'greater', or 'less'.")
m1, m2 = asarray(m1), asarray(m2)
if m1.shape != m2.shape:
raise ValueError("Both distance matrices must be the same size.")
if n < 1:
raise ValueError("The number of permutations must be greater than or "
"equal to one.")
if not suppress_symmetry_and_hollowness_check:
if not (is_symmetric_and_hollow(m1) and is_symmetric_and_hollow(m2)):
raise ValueError("Both distance matrices must be symmetric and "
"hollow.")
# Get a flattened list of lower-triangular matrix elements (excluding the
# diagonal) in column-major order. Use these values to calculate the
# correlation statistic.
m1_flat, m2_flat = _flatten_lower_triangle(m1), _flatten_lower_triangle(m2)
orig_stat = pearson(m1_flat, m2_flat)
# Run our permutation tests so we can calculate a p-value for the test.
size = len(m1)
better = 0
perm_stats = []
for i in range(n):
perm = permute_2d(m1, permutation(size))
perm_flat = _flatten_lower_triangle(perm)
r = pearson(perm_flat, m2_flat)
if alt == 'two sided':
if abs(r) >= abs(orig_stat):
better += 1
else:
if ((alt == 'greater' and r >= orig_stat) or
(alt == 'less' and r <= orig_stat)):
better += 1
perm_stats.append(r)
return (better + 1) / (n + 1), orig_stat, perm_stats
def is_symmetric_and_hollow(matrix):
return (matrix.T == matrix).all() and (trace(matrix) == 0)
def _flatten_lower_triangle(matrix):
"""Returns a list containing the flattened lower triangle of the matrix.
The returned list will contain the elements in column-major order. The
diagonal will be excluded.
Arguments:
matrix - numpy array containing the matrix data
"""
matrix = asarray(matrix)
flattened = []
for col_num in range(matrix.shape[1]):
for row_num in range(matrix.shape[0]):
if col_num < row_num:
flattened.append(matrix[row_num][col_num])
return flattened
def kendall_correlation(x, y, alt="two sided", exact=None, warn=True):
"""returns the statistic (tau) and probability from Kendall's non-parametric
test of association that tau==0. Uses the large sample approximation when
len(x) >= 50 or when there are ties, otherwise it computes the probability
exactly.
Based on the algorithm implemented in R v2.5
Arguments:
- alt: the alternate hypothesis (greater, less, two sided)
- exact: when False, forces use of the large sample approximation
(normal distribution). Not allowed for len(x) >= 50.
- warn: whether to warn about tied values
"""
assert len(x) == len(y), "data (x, y) not of same length"
assert len(x) > 2, "not enough observations"
# possible alternate hypotheses arguments
lo = ["less", "lo", "lower", "l", "lt"]
hi = ["greater", "hi", "high", "h", "g", "gt"]
two = ["two sided", "2", 2, "two tailed", "two", "two.sided", "ts"]
ties = False
num = len(x)
ties = len(set(x)) != num or len(set(y)) != num
if ties and warn:
warnings.warn("Tied values, using normal approximation")
if not ties and num < 50:
exact = True
if num < 50 and not ties and exact:
combs = int(num * (num-1) / 2)
working = []
for i in range(combs):
row = [-1 for j in range(combs)]
working.append(row)
tau = kendalls_tau(x, y, False)
q = round((tau+1)*num*(num-1) / 4)
if alt in two:
if q > num * (num - 1) / 4:
p = 1 - pkendall(q-1, num, Gamma(num+1), working)
else:
p = pkendall(q, num, Gamma(num+1), working)
p = min(2*p, 1)
elif alt in hi:
p = 1 - pkendall(q-1, num, Gamma(num+1), working)
elif alt in lo:
p = pkendall(q, num, Gamma(num+1), working)
else:
tau, p = kendalls_tau(x, y, True)
if alt in hi:
p /= 2
elif alt in lo:
p = 1 - p/2
return tau, p
## Start functions for distance_matrix_permutation_test
def distance_matrix_permutation_test(matrix, cells, cells2=None,\
f=t_two_sample, tails=None, n=1000, return_scores=False,\
is_symmetric=True):
"""performs a monte carlo permutation test to determine if the
values denoted in cells are significantly different than the rest
of the values in the matrix
matrix: a numpy array
cells: a list of indices of special cells to compare to the rest of the
matrix
cells2: an optional list of indices to compare cells to. If set to None
(default), compares cells to the rest of the matrix
f: the statistical test used. Should take a "tails" parameter as input
tails: can be None(default), 'high', or 'low'. Input into f.
n: the number of replicates in the Monte Carlo simulations
is_symmetric: corrects if the matrix is symmetric. Need to only look at
one half otherwise the degrees of freedom value will be incorrect.
"""
#if matrix is symmetric convert all indices to lower trangular
if is_symmetric:
cells = get_ltm_cells(cells)
if cells2:
cells2 = get_ltm_cells(cells2)
# pull out the special values
special_values, other_values = \
get_values_from_matrix(matrix, cells, cells2, is_symmetric)
# calc the stat and parameteric p-value for real data
stat, p = f(special_values, other_values, tails)
#calc for randomized matrices
count_more_extreme = 0
stats = []
indices = range(len(matrix))
for k in range(n):
# shuffle the order of indices, and use those to permute the matrix
permuted_matrix = permute_2d(matrix,permutation(indices))
special_values, other_values = \
get_values_from_matrix(permuted_matrix, cells,\
cells2, is_symmetric)
# calc the stat and p for a random subset (we don't do anything
# with these p-values, we only use the current_stat value)
current_stat, current_p = f(special_values, other_values, tails)
stats.append(current_stat)
if tails == None:
if abs(current_stat) > abs(stat): count_more_extreme += 1
elif tails == 'low':
if current_stat < stat: count_more_extreme += 1
elif tails == 'high':
if current_stat > stat: count_more_extreme += 1
# pack up the parametric stat, parametric p, and empirical p; calc the
# the latter in the process
result = [stat, p, count_more_extreme/n]
# append the scores of the n tests if requested
if return_scores: result.append(stats)
return tuple(result)
def get_values_from_matrix(matrix, cells, cells2=None, is_symmetric=True):
"""get values from matrix positions in cells and cells2
matrix: the numpy array from which values should be taken
cells: indices of first set of requested values
cells2: indices of second set of requested values or None
if they should be randomly selected
is_symmetric: True if matrix is symmetric
"""
# pull cells values
cells_values = [matrix[i] for i in cells]
# pull cells2 values
if cells2:
cells2_values = [matrix[i] for i in cells2]
# or generate the indices and grab them if they
# weren't passed in
else:
cells2_values = []
for i, val_i in enumerate(matrix):
for j, val in enumerate(val_i):
if is_symmetric:
if (i,j) not in cells and i > j:
cells2_values.append(val)
else:
if (i,j) not in cells:
cells2_values.append(val)
return cells_values, cells2_values
def get_ltm_cells(cells):
"""converts matrix indices so all are below the diagonal
cells: list of indices into a 2D integer-indexable object
(typically a list or lists of array of arrays)
"""
new_cells = []
for cell in cells:
if cell[0] < cell[1]:
new_cells.append((cell[1], cell[0]))
elif cell[0] > cell[1]:
new_cells.append(cell)
#remove duplicates
new_cells = set(new_cells)
return list(new_cells)
## End functions for distance_matrix_permutation_test
|