Skip to content

Commit f0722d8

Browse files
committed
Add util/genreate.py and tests
1 parent 49101bd commit f0722d8

38 files changed

+2934
-257
lines changed
Lines changed: 120 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -1,114 +1,120 @@
1-
{
2-
"metadata": {
3-
"name": ""
4-
},
5-
"nbformat": 3,
6-
"nbformat_minor": 0,
7-
"worksheets": [
8-
{
9-
"cells": [
10-
{
11-
"cell_type": "markdown",
12-
"metadata": {},
13-
"source": [
14-
"Assume we are given SNP matrix $G \\in \\mathbb{R}^{N,D}$. Standardizing it, will set the mean $\\mu$ to zero and the variance to one for each SNP $j$.\n",
15-
"\n",
16-
"The sample variance for SNP $j$ ($\\text{var}_j$) is defined as:\n",
17-
"$$ \\text{var}_j = \\frac{1}{N} \\sum_{i=1}^N (G_{ij} - \\mu)^2 = \\frac{1}{N} \\sum_{i=1}^N G_{ij}^2 = 1 $$\n",
18-
"\n",
19-
"Thus, when computing the sum of squared entries (as in the \"new\" normalization scheme), we get:\n",
20-
"\n",
21-
"$$ ss = \\sum_{i=1}^N \\sum_{j=1}^D G_{ij}^2 = \\sum_{j=1}^D N \\cdot \\text{var}_j = N \\sum_{j=1}^D 1 = N \\cdot D $$\n",
22-
"\n",
23-
"Thus, normalizing $G$ by $\\sqrt{\\frac{ss}{N}}$ is equivalent to normalizing by $\\sqrt{D}$ if $G$ was unit standardized."
24-
]
25-
},
26-
{
27-
"cell_type": "code",
28-
"collapsed": false,
29-
"input": [
30-
"import numpy as np\n",
31-
"from pysnptools.standardizer.diag_K_to_N import DiagKtoN\n",
32-
"from pysnptools.standardizer import Unit\n",
33-
"\n",
34-
"N = 10\n",
35-
"D = 100\n",
36-
"\n",
37-
"np.random.seed(42)\n",
38-
"m = np.random.random((N,D))\n",
39-
"\n",
40-
"mu = Unit().standardize(m.copy())\n",
41-
"\n",
42-
"# get factor\n",
43-
"d2 = np.sum(mu**2) / float(N)\n",
44-
"\n",
45-
"print \"factor:\", d2, \"== D\"\n",
46-
"s = DiagKtoN(N)\n",
47-
"s.standardize(m)\n",
48-
"K = m.dot(m.T)\n",
49-
"sum_diag = np.sum(np.diag(K))\n",
50-
"\n",
51-
"print \"sum of diagonal\", sum_diag"
52-
],
53-
"language": "python",
54-
"metadata": {},
55-
"outputs": [
56-
{
57-
"output_type": "stream",
58-
"stream": "stdout",
59-
"text": [
60-
"factor: 100.0 == D\n",
61-
"sum of diagonal 10.0\n"
62-
]
63-
}
64-
],
65-
"prompt_number": 28
66-
},
67-
{
68-
"cell_type": "code",
69-
"collapsed": false,
70-
"input": [
71-
"# this may not hold true for other standardizers (e.g. beta)...\n",
72-
"\n",
73-
"import numpy as np\n",
74-
"from pysnptools.standardizer import Beta\n",
75-
"\n",
76-
"N = 10\n",
77-
"D = 100\n",
78-
"\n",
79-
"np.random.seed(42)\n",
80-
"m = np.random.random((N,D))\n",
81-
"\n",
82-
"mu = Beta().standardize(m.copy())\n",
83-
"\n",
84-
"# get factor\n",
85-
"d2 = np.sum(mu**2) / float(N)\n",
86-
"\n",
87-
"print \"factor: \", d2, \"!= D\"\n"
88-
],
89-
"language": "python",
90-
"metadata": {},
91-
"outputs": [
92-
{
93-
"output_type": "stream",
94-
"stream": "stdout",
95-
"text": [
96-
"factor: 0.0624957032658 != D\n"
97-
]
98-
}
99-
],
100-
"prompt_number": 29
101-
},
102-
{
103-
"cell_type": "code",
104-
"collapsed": false,
105-
"input": [],
106-
"language": "python",
107-
"metadata": {},
108-
"outputs": []
109-
}
110-
],
111-
"metadata": {}
112-
}
113-
]
114-
}
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"Assume we are given a SNP matrix $G \\in \\mathbb{R}^{N,D}$. Standardizing it sets the mean $\\mu$ to zero and the variance to one for each SNP $j$.\n",
8+
"\n",
9+
"The sample variance for SNP $j$ ($\\text{var}_j$) is defined as follows, where the second equality uses $\\mu = 0$ after standardization:\n",
10+
"$$ \\text{var}_j = \\frac{1}{N} \\sum_{i=1}^N (G_{ij} - \\mu)^2 = \\frac{1}{N} \\sum_{i=1}^N G_{ij}^2 = 1 $$\n",
11+
"\n",
12+
"Thus, when computing the sum of squared entries (as in the \"new\" normalization scheme), we get:\n",
13+
"\n",
14+
"$$ ss = \\sum_{i=1}^N \\sum_{j=1}^D G_{ij}^2 = \\sum_{j=1}^D N \\cdot \\text{var}_j = N \\sum_{j=1}^D 1 = N \\cdot D $$\n",
15+
"\n",
16+
"Therefore, normalizing $G$ by $\\sqrt{\\frac{ss}{N}}$ is equivalent to normalizing by $\\sqrt{D}$ if $G$ has been unit standardized."
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 2,
22+
"metadata": {},
23+
"outputs": [
24+
{
25+
"name": "stdout",
26+
"output_type": "stream",
27+
"text": [
28+
"factor: 100.0 == D\n",
29+
"sum of diagonal 10.000000000000002\n"
30+
]
31+
}
32+
],
33+
"source": [
34+
"import numpy as np\n",
35+
"from pysnptools.standardizer.diag_K_to_N import DiagKtoN\n",
36+
"from pysnptools.standardizer import Unit\n",
37+
"\n",
38+
"N = 10\n",
39+
"D = 100\n",
40+
"\n",
41+
"np.random.seed(42)\n",
42+
"m = np.random.random((N,D))\n",
43+
"\n",
44+
"mu = Unit().standardize(m.copy())\n",
45+
"\n",
46+
"# get factor\n",
47+
"d2 = np.sum(mu**2) / float(N)\n",
48+
"\n",
49+
"print \"factor:\", d2, \"== D\"\n",
50+
"s = DiagKtoN(N)\n",
51+
"s.standardize(m)\n",
52+
"K = m.dot(m.T)\n",
53+
"sum_diag = np.sum(np.diag(K))\n",
54+
"\n",
55+
"print \"sum of diagonal\", sum_diag"
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": 1,
61+
"metadata": {},
62+
"outputs": [
63+
{
64+
"name": "stdout",
65+
"output_type": "stream",
66+
"text": [
67+
"factor: 7.8492188930247835 != D\n"
68+
]
69+
}
70+
],
71+
"source": [
72+
"# this may not hold true for other standardizers (e.g. beta)...\n",
73+
"\n",
74+
"import numpy as np\n",
75+
"from pysnptools.standardizer import Beta\n",
76+
"\n",
77+
"N = 10\n",
78+
"D = 100\n",
79+
"\n",
80+
"np.random.seed(42)\n",
81+
"m = np.random.random((N,D))\n",
82+
"\n",
83+
"mu = Beta(1,1).standardize(m.copy())\n",
84+
"\n",
85+
"# get factor\n",
86+
"d2 = np.sum(mu**2) / float(N)\n",
87+
"\n",
88+
"print \"factor: \", d2, \"!= D\"\n"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": null,
94+
"metadata": {},
95+
"outputs": [],
96+
"source": []
97+
}
98+
],
99+
"metadata": {
100+
"kernelspec": {
101+
"display_name": "Python 2",
102+
"language": "python",
103+
"name": "python2"
104+
},
105+
"language_info": {
106+
"codemirror_mode": {
107+
"name": "ipython",
108+
"version": 2
109+
},
110+
"file_extension": ".py",
111+
"mimetype": "text/x-python",
112+
"name": "python",
113+
"nbconvert_exporter": "python",
114+
"pygments_lexer": "ipython2",
115+
"version": "2.7.16"
116+
}
117+
},
118+
"nbformat": 4,
119+
"nbformat_minor": 1
120+
}

0 commit comments

Comments
 (0)