Skip to content

Commit d510e77

Browse files
authored
Merge pull request #57 from OpenMined/47_syft_partition_dataset
Create syft-compatible partition dataset
2 parents 3d1178c + 35fbc39 commit d510e77

11 files changed

Lines changed: 745 additions & 4 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,6 @@ cython_debug/<Paste>
153153
MNIST/
154154
images/*
155155
!images/diagram_white_background.png
156+
157+
examples/experimental/*
158+
!examples/experimental/*.ipynb

examples/PyVertical Example.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,8 @@
126126
"import syft as sy\n",
127127
"\n",
128128
"from src.dataloader import VerticalDataLoader\n",
129-
"from src.dataset import add_ids\n",
130129
"from src.psi.util import compute_psi\n",
130+
"from src.utils import add_ids\n",
131131
"\n",
132132
"hook = sy.TorchHook(torch)"
133133
]
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"Falling back to insecure randomness since the required custom op could not be found for the installed version of TensorFlow. Fix this by compiling custom ops. Missing file was '/home/tom/anaconda3/envs/pyvertical-dev/lib/python3.7/site-packages/tf_encrypted/operations/secure_random/secure_random_module_tf_1.15.3.so'\n"
13+
]
14+
},
15+
{
16+
"name": "stdout",
17+
"output_type": "stream",
18+
"text": [
19+
"WARNING:tensorflow:From /home/tom/anaconda3/envs/pyvertical-dev/lib/python3.7/site-packages/tf_encrypted/session.py:24: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n",
20+
"\n"
21+
]
22+
}
23+
],
24+
"source": [
25+
"import os\n",
26+
"import sys\n",
27+
"sys.path.append(\"..\" + os.sep + \"..\")\n",
28+
"\n",
29+
"import torch\n",
30+
"import syft as sy\n",
31+
"\n",
32+
"from src.future import PartitionedDataset, VerticalDataset"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 2,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"hook = sy.TorchHook(torch)"
42+
]
43+
},
44+
{
45+
"cell_type": "markdown",
46+
"metadata": {},
47+
"source": [
48+
"We will first create some toy data and wrap it in a PartitionedDataset. PartitionedDatasets can hold data, targets or both."
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": 3,
54+
"metadata": {},
55+
"outputs": [],
56+
"source": [
57+
"data = torch.tensor([1.0, 2.0, 3.0]).tag(\"#toy\").describe(\"Toy input data.\")\n",
58+
"targets = torch.tensor([0, 1, 1]).tag(\"#toy\").describe(\"Toy data labels.\")\n",
59+
"dataset = PartitionedDataset(data, targets)"
60+
]
61+
},
62+
{
63+
"cell_type": "markdown",
64+
"metadata": {},
65+
"source": [
66+
"Just one dataset isn't very exciting - the data is not vertically partitioned! PartitionedDatasets come with a helper method to vertically partition a dataset.\n",
67+
"We will move the data onto virtual workers."
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 4,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": [
76+
"alice = sy.VirtualWorker(id=\"alice\", hook=hook, is_client_worker=False)\n",
77+
"bob = sy.VirtualWorker(id=\"bob\", hook=hook, is_client_worker=False)"
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": 5,
83+
"metadata": {},
84+
"outputs": [],
85+
"source": [
86+
"vertical_data = dataset.vertically_federate((alice, bob))"
87+
]
88+
},
89+
{
90+
"cell_type": "code",
91+
"execution_count": 6,
92+
"metadata": {},
93+
"outputs": [
94+
{
95+
"data": {
96+
"text/plain": [
97+
"src.future.dataset.VerticalDataset"
98+
]
99+
},
100+
"execution_count": 6,
101+
"metadata": {},
102+
"output_type": "execute_result"
103+
}
104+
],
105+
"source": [
106+
"type(vertical_data)"
107+
]
108+
},
109+
{
110+
"cell_type": "markdown",
111+
"metadata": {},
112+
"source": [
113+
"This new dataset is a VerticalDataset. This is similar to syft's FederatedDataset - it holds a list of vertically partitioned datasets assigned to different workers."
114+
]
115+
},
116+
{
117+
"cell_type": "code",
118+
"execution_count": 7,
119+
"metadata": {},
120+
"outputs": [
121+
{
122+
"name": "stdout",
123+
"output_type": "stream",
124+
"text": [
125+
"Toy input data.\n"
126+
]
127+
}
128+
],
129+
"source": [
130+
"alice_results = alice.search([\"#toy\"])\n",
131+
"for res in alice_results:\n",
132+
" print(res.description)"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": 8,
138+
"metadata": {},
139+
"outputs": [
140+
{
141+
"name": "stdout",
142+
"output_type": "stream",
143+
"text": [
144+
"Toy data labels.\n"
145+
]
146+
}
147+
],
148+
"source": [
149+
"bob_results = bob.search([\"#toy\"])\n",
150+
"for res in bob_results:\n",
151+
" print(res.description)"
152+
]
153+
},
154+
{
155+
"cell_type": "markdown",
156+
"metadata": {},
157+
"source": [
158+
"You can see that Alice has the data and Bob has the labels."
159+
]
160+
},
161+
{
162+
"cell_type": "code",
163+
"execution_count": 9,
164+
"metadata": {},
165+
"outputs": [
166+
{
167+
"data": {
168+
"text/plain": [
169+
"['alice', 'bob']"
170+
]
171+
},
172+
"execution_count": 9,
173+
"metadata": {},
174+
"output_type": "execute_result"
175+
}
176+
],
177+
"source": [
178+
"vertical_data.workers"
179+
]
180+
},
181+
{
182+
"cell_type": "markdown",
183+
"metadata": {},
184+
"source": [
185+
"You can collect a dataset from its remote worker."
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": 10,
191+
"metadata": {},
192+
"outputs": [],
193+
"source": [
194+
"alices_dataset = vertical_data.get_dataset(\"alice\")"
195+
]
196+
},
197+
{
198+
"cell_type": "code",
199+
"execution_count": 11,
200+
"metadata": {},
201+
"outputs": [
202+
{
203+
"data": {
204+
"text/plain": [
205+
"PartitionedDataset\n",
206+
"\tData: tensor([1., 2., 3.])"
207+
]
208+
},
209+
"execution_count": 11,
210+
"metadata": {},
211+
"output_type": "execute_result"
212+
}
213+
],
214+
"source": [
215+
"alices_dataset"
216+
]
217+
},
218+
{
219+
"cell_type": "code",
220+
"execution_count": 12,
221+
"metadata": {},
222+
"outputs": [
223+
{
224+
"data": {
225+
"text/plain": [
226+
"['bob']"
227+
]
228+
},
229+
"execution_count": 12,
230+
"metadata": {},
231+
"output_type": "execute_result"
232+
}
233+
],
234+
"source": [
235+
"vertical_data.workers"
236+
]
237+
},
238+
{
239+
"cell_type": "markdown",
240+
"metadata": {},
241+
"source": [
242+
"After collecting Alice's dataset, the VerticalDataset only contains Bob's labels."
243+
]
244+
}
245+
],
246+
"metadata": {
247+
"kernelspec": {
248+
"display_name": "Python 3",
249+
"language": "python",
250+
"name": "python3"
251+
},
252+
"language_info": {
253+
"codemirror_mode": {
254+
"name": "ipython",
255+
"version": 3
256+
},
257+
"file_extension": ".py",
258+
"mimetype": "text/x-python",
259+
"name": "python",
260+
"nbconvert_exporter": "python",
261+
"pygments_lexer": "ipython3",
262+
"version": "3.7.6"
263+
}
264+
},
265+
"nbformat": 4,
266+
"nbformat_minor": 4
267+
}

src/dataloader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from torch.utils.data import DataLoader
1010
from torch.utils.data._utils.collate import default_collate
11-
from src.dataset import partition_dataset
11+
from src.utils import partition_dataset
1212

1313

1414
def id_collate_fn(batch: Tuple) -> List:

src/future/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .dataset import PartitionedDataset, VerticalDataset

0 commit comments

Comments
 (0)