-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset.py
More file actions
52 lines (43 loc) · 1.5 KB
/
dataset.py
File metadata and controls
52 lines (43 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import torch
from torch.utils.data import IterableDataset
from .pipeline import Pipeline
class SparkIterableDataset(IterableDataset):
    """Stream items from a Spark pipeline as a PyTorch iterable dataset.

    Each pass re-runs the pipeline and iterates the resulting RDD locally
    via ``toLocalIterator()``, optionally converting each item to a tensor
    (or dict of tensors) before yielding it.
    """

    def __init__(
        self,
        spark_pipeline: "Pipeline",
        to_tensor: bool = True,
        infinite: bool = False,
    ):
        """
        Args:
            spark_pipeline: Pipeline whose ``run()`` returns an RDD
                supporting ``toLocalIterator()``.
            to_tensor: If True, convert yielded items via ``_convert``.
            infinite: If True, loop forever, re-running the pipeline after
                each exhausted pass (useful for step-based training loops).
        """
        super().__init__()
        self.spark_pipeline = spark_pipeline
        self.to_tensor = to_tensor
        self.infinite = infinite

    def _convert(self, item):
        """Convert Spark output item to torch.Tensor/dict-of-tensors if needed.

        Tensors pass through unchanged; dict values, sequences and numeric
        scalars are converted to float32 tensors; any other type is returned
        as-is.
        """
        if not self.to_tensor or isinstance(item, torch.Tensor):
            return item
        if isinstance(item, dict):
            return {
                k: v if isinstance(v, torch.Tensor)
                else torch.tensor(v, dtype=torch.float32)
                for k, v in item.items()
            }
        if isinstance(item, (list, tuple)):
            return torch.tensor(item, dtype=torch.float32)
        if isinstance(item, (int, float)):
            # Wrap scalars in a length-1 tensor so downstream batching works.
            return torch.tensor([item], dtype=torch.float32)
        return item

    def __iter__(self):
        # Single epoch loop shared by both modes; `infinite` only decides
        # whether to re-run the pipeline after exhausting one pass.
        while True:
            rdd = self.spark_pipeline.run()
            for item in rdd.toLocalIterator():
                yield self._convert(item)
            if not self.infinite:
                return