Skip to content

Commit d905265

Browse files
author
Tomasz Latkowski
committed
added Pearson correlation
1 parent 4619b99 commit d905265

File tree

3 files changed

+68
-16
lines changed

3 files changed

+68
-16
lines changed

methods/selection.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,72 @@
1-
import tensorflow as tf
21
import pandas as pd
2+
import tensorflow as tf
33

44
data_file = '../data/autism.tsv'
55
df = pd.read_csv(data_file, sep='\t', header=None, index_col=0).T
66

77

8-
def fisher(data, num_instances: list, top_k=10):
8+
def fisher(data, num_instances: list, top_k_features=10):
99
"""
1010
Performs Fisher feature selection method according to the following formula:
1111
D(f) = |m1(f) - m2(f)| / (std1(f) + std2(f))
1212
1313
:param data:
1414
:param num_instances:
15-
:param top_k:
15+
:param top_k_features:
1616
:return: the list of most significant features.
1717
"""
1818
assert len(num_instances) == 2, "Fisher selection method can be performed for two-class problems."
19+
data = tf.convert_to_tensor(data)
20+
_, num_features = data.get_shape().as_list()
21+
if top_k_features < num_features:
22+
top_k_features = num_features
1923
class1, class2 = tf.split(data, num_instances)
2024
mean1, std1 = tf.nn.moments(class1, axes=0)
2125
mean2, std2 = tf.nn.moments(class2, axes=0)
22-
fisher_coeffs = tf.abs((mean1 - mean2)) / (std1 + std2)
23-
return tf.nn.top_k(fisher_coeffs, k=top_k)
26+
fisher_coeffs = tf.abs(mean1 - mean2) / (std1 + std2)
27+
return tf.nn.top_k(fisher_coeffs, k=top_k_features)
2428

2529

26-
def feature_correlation_with_class(data, num_instances: list, top_k=10):
30+
def feature_correlation_with_class(data, num_instances: list, top_k_features=10):
2731
"""
2832
Makes feature correlation with class selection according to the following formula:
2933
D(f) = [(m1(f) - m(f))^2 + (m2(f) - m(f))^2] / (2 * sigma(f)^2)
3034
:return: the list of most significant features.
3135
"""
3236
data = tf.convert_to_tensor(data)
37+
_, num_features = data.get_shape().as_list()
38+
if top_k_features < num_features:
39+
top_k_features = num_features
3340
class1, class2 = tf.split(data, num_instances)
3441
mean1, std1 = tf.nn.moments(class1, axes=0)
3542
mean2, std2 = tf.nn.moments(class2, axes=0)
3643
mean, std = tf.nn.moments(data, axes=0)
37-
corr_coeffs = (tf.square(mean1 - mean) + tf.square(mean2 - mean)) / 2*tf.square(std) # FIXME sth is wrong
38-
return tf.nn.top_k(corr_coeffs, k=top_k)
44+
corr_coeffs = (tf.square(mean1 - mean) + tf.square(mean2 - mean)) / 2*tf.square(std)
45+
return tf.nn.top_k(corr_coeffs, k=top_k_features)
3946

4047

41-
def t_test(data, num_instances: list, top_k=10):
48+
def t_test(data, num_instances: list, top_k_features=10):
4249
"""
4350
Performs t-test feature selection according to the following formula:
4451
D(f) = |m1(f) - m2(f)| / sqrt(std1(f)^2 / n1 + std2(f)^2 / n2)
4552
:return: the list of most significant features.
4653
"""
54+
data = tf.convert_to_tensor(data)
55+
_, num_features = data.get_shape().as_list()
56+
if top_k_features < num_features:
57+
top_k_features = num_features
4758
class1, class2 = tf.split(data, num_instances)
4859
mean1, std1 = tf.nn.moments(class1, axes=0)
4960
mean2, std2 = tf.nn.moments(class2, axes=0)
50-
t_test_coeffs = (mean1 - mean2) / tf.sqrt(tf.square(std1)/num_instances[0] + tf.square(std2) / num_instances[1])
51-
return tf.nn.top_k(t_test_coeffs, k=top_k)
61+
t_test_coeffs = tf.abs(mean1 - mean2) / tf.sqrt(tf.square(std1)/num_instances[0] + tf.square(std2) / num_instances[1])
62+
return tf.nn.top_k(t_test_coeffs, k=top_k_features)
5263

5364
with tf.Session() as session:
5465
input_data = df.as_matrix()
5566
instances_per_class = [82, 64]
56-
fisher_coeffs = session.run(fisher(data=input_data, num_instances=instances_per_class))
57-
corr_coeffs = session.run(feature_correlation_with_class(data=input_data, num_instances=instances_per_class))
58-
t_test_coeff = session.run(t_test(data=input_data, num_instances=instances_per_class))
67+
fisher_coeffs = session.run(fisher(data=input_data, num_instances=instances_per_class, top_k_features=5))
68+
corr_coeffs = session.run(feature_correlation_with_class(data=input_data, num_instances=instances_per_class, top_k_features=5))
69+
t_test_coeff = session.run(t_test(data=input_data, num_instances=instances_per_class, top_k_features=5))
5970
print(fisher_coeffs)
6071
print(corr_coeffs)
6172
print(t_test_coeff)

tests/pearson.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import tensorflow as tf
2+
from utils.statistics import pearson_correlation
3+
import numpy as np
4+
5+
6+
class PearsonTest(tf.test.TestCase):
7+
8+
def testPearsonCoefficientValueForTwoVectors(self):
9+
with self.test_session() as test_session:
10+
x1 = np.array([2., 3., 4.])
11+
x2 = np.array([3., 1., 5.])
12+
actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2))
13+
correct_pearson_coefficient = tf.constant([.5])
14+
self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient.eval())
15+
16+
def testNegativePearsonCoefficientValueForTwoVectors(self):
17+
with self.test_session() as test_session:
18+
x1 = np.array([1., 2., 3.])
19+
x2 = np.array([-1., -2., -3.])
20+
actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2))
21+
correct_pearson_coefficient = tf.constant([-1.])
22+
self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient.eval())
23+
24+
def testPositivePearsonCoefficientValueForTwoVectors(self):
25+
with self.test_session() as test_session:
26+
x1 = np.array([1., 2., 3.])
27+
x2 = np.array([1., 2., 3.])
28+
actual_pearson_coefficient = test_session.run(pearson_correlation(x1, x2))
29+
correct_pearson_coefficient = tf.constant([1.])
30+
self.assertEqual(actual_pearson_coefficient, correct_pearson_coefficient.eval())
31+
32+
if __name__ == '__main__':
33+
tf.test.main()

utils/statistics.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
1+
import tensorflow as tf
12

23

3-
def pearson_correlation():
4-
pass
4+
def pearson_correlation(x1, x2):
5+
x1 = tf.convert_to_tensor(x1)
6+
x2 = tf.convert_to_tensor(x2)
7+
m1, std1 = tf.nn.moments(x1, axes=0)
8+
m2, std2 = tf.nn.moments(x2, axes=0)
9+
l = tf.reduce_sum((x1 - m1) * (x2 - m2))
10+
i = tf.reduce_sum((x1 - m1) ** 2) * tf.reduce_sum((x2 - m2) ** 2)
11+
p = tf.sqrt(i)
12+
return l/p
513

614

715
def f_test():

0 commit comments

Comments
 (0)