Skip to content

Commit 3ddc08c

Browse files
Merge pull request #1 from LCSB-BioCore/mk
Start the package
2 parents 4338d8f + 5af3606 commit 3ddc08c

9 files changed

Lines changed: 460 additions & 4 deletions

File tree

.github/workflows/ci.yml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
name: CI
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
8+
jobs:
9+
test:
10+
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
11+
runs-on: ${{ matrix.os }}
12+
strategy:
13+
fail-fast: false
14+
matrix:
15+
version:
16+
- 'nightly'
17+
os:
18+
- ubuntu-latest
19+
- macOS-latest
20+
- windows-latest
21+
arch:
22+
- x64
23+
steps:
24+
- uses: actions/checkout@v2
25+
- uses: julia-actions/setup-julia@v1
26+
with:
27+
version: ${{ matrix.version }}
28+
arch: ${{ matrix.arch }}
29+
- uses: actions/cache@v1
30+
env:
31+
cache-name: cache-artifacts
32+
with:
33+
path: ~/.julia/artifacts
34+
key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
35+
restore-keys: |
36+
${{ runner.os }}-test-${{ env.cache-name }}-
37+
${{ runner.os }}-test-
38+
${{ runner.os }}-
39+
- uses: julia-actions/julia-buildpkg@latest
40+
- run: |
41+
git config --global user.name Tester
42+
git config --global user.email te@st.er
43+
- uses: julia-actions/julia-runtest@latest
44+
continue-on-error: ${{ matrix.version == 'nightly' }}
45+
- uses: julia-actions/julia-processcoverage@v1
46+
- uses: codecov/codecov-action@v1
47+
with:
48+
file: lcov.info

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,6 @@ docs/site/
2222
# committed for packages, but should be committed for applications that require a static
2323
# environment.
2424
Manifest.toml
25+
26+
#vim
27+
.*.swp

Project.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
name = "DiDa"
2-
uuid = "562f8d76-d205-4967-b33e-8d07f0059ed7"
2+
uuid = "f6a0035f-c5ac-4ad0-b410-ad102ced35df"
33
authors = ["Mirek Kratochvil <exa.exa@gmail.com>"]
44
version = "0.1.0"
5+
6+
[deps]
7+
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
8+
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
9+
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
10+
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,9 @@
1-
# DiDa.jl
1+
# DiDa.jl
2+
3+
Simple Distributed Data manipulation and processing routines for Julia.
4+
5+
This was originally developed for
6+
[GigaSOM.jl](https://github.com/LCSB-BioCore/GigaSOM.jl), this package contains
7+
the separated-out lightweight distributed-processing framework that can be used
8+
with GigaSOM.
9+

src/DiDa.jl

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,43 @@
1-
module DD
1+
module DiDa
22

3-
greet() = print("Hello World!")
3+
using Distributed
4+
using Serialization
5+
6+
include("structs.jl")
7+
export Dinfo
8+
9+
include("base.jl")
10+
export save_at,
11+
get_from,
12+
get_val_from,
13+
remove_from,
14+
distribute_array,
15+
distribute_darray,
16+
undistribute,
17+
distributed_transform,
18+
distributed_mapreduce,
19+
distributed_foreach,
20+
distributed_collect
21+
22+
include("io.jl")
23+
export distributed_export,
24+
distributed_import,
25+
distributed_unlink
26+
27+
include("tools.jl")
28+
export dcopy,
29+
dselect,
30+
dapply_cols,
31+
dapply_rows,
32+
dstat,
33+
dstat_buckets,
34+
dcount,
35+
dcount_buckets,
36+
dscale,
37+
dtransform_asinh,
38+
dmedian,
39+
dmedian_buckets,
40+
mapbuckets,
41+
catmapbuckets
442

543
end # module

src/io.jl

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
2+
"""
3+
defaultFiles(s, pids)
4+
5+
Make a good set of filenames for saving a dataset.
6+
"""
7+
function defaultFiles(s, pids)
8+
return [String(s) * "-$i.slice" for i in eachindex(pids)]
9+
end
10+
11+
"""
12+
distributed_export(sym::Symbol, pids, files=defaultFiles(sym,pids))
13+
14+
Export the content of symbol `sym` by each worker specified by `pids` to a
15+
corresponding filename in `files`.
16+
"""
17+
function distributed_export(sym::Symbol, pids, files = defaultFiles(sym, pids))
18+
distributed_foreach(
19+
files,
20+
(fn) -> Base.eval(Main, :(
21+
begin
22+
open(f -> $serialize(f, $sym), $fn, "w")
23+
nothing
24+
end
25+
)),
26+
pids,
27+
)
28+
nothing
29+
end
30+
31+
"""
32+
distributed_export(dInfo::Dinfo, files=defaultFiles(dInfo.val, dInfo.workers))
33+
34+
Overloaded functionality for `Dinfo`.
35+
"""
36+
function distributed_export(
37+
dInfo::Dinfo,
38+
files = defaultFiles(dInfo.val, dInfo.workers),
39+
)
40+
distributed_export(dInfo.val, dInfo.workers, files)
41+
end
42+
43+
"""
44+
distributed_import(sym::Symbol, pids, files=defaultFiles(sym,pids))
45+
46+
Import the content of symbol `sym` by each worker specified by `pids` from the
47+
corresponding filename in `files`.
48+
"""
49+
function distributed_import(sym::Symbol, pids, files = defaultFiles(sym, pids))
50+
distributed_foreach(
51+
files,
52+
(fn) -> Base.eval(Main, :(
53+
begin
54+
$sym = open($deserialize, $fn)
55+
nothing
56+
end
57+
)),
58+
pids,
59+
)
60+
return Dinfo(sym, pids)
61+
end
62+
63+
"""
64+
distributed_import(dInfo::Dinfo, files=defaultFiles(dInfo.val, dInfo.workers))
65+
66+
Overloaded functionality for `Dinfo`.
67+
"""
68+
function distributed_import(
69+
dInfo::Dinfo,
70+
files = defaultFiles(dInfo.val, dInfo.workers),
71+
)
72+
distributed_import(dInfo.val, dInfo.workers, files)
73+
end
74+
75+
"""
76+
distributed_unlink(sym::Symbol, pids, files=defaultFiles(sym,pids))
77+
78+
Remove the files created by `distributed_export` with the same parameters.
79+
"""
80+
function distributed_unlink(sym::Symbol, pids, files = defaultFiles(sym, pids))
81+
distributed_foreach(files, (fn) -> rm(fn), pids)
82+
nothing
83+
end
84+
85+
"""
86+
distributed_unlink(dInfo::Dinfo, files=defaultFiles(dInfo.val, dInfo.workers))
87+
88+
Overloaded functionality for `Dinfo`.
89+
"""
90+
function distributed_unlink(
91+
dInfo::Dinfo,
92+
files = defaultFiles(dInfo.val, dInfo.workers),
93+
)
94+
distributed_unlink(dInfo.val, dInfo.workers, files)
95+
end

test/base.jl

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
2+
@testset "Base functions" begin
3+
4+
@testset "Distributed data transfers -- local" begin
5+
data = rand(5)
6+
save_at(1, :test, data)
7+
8+
@test fetch(get_from(1, :test)) == data
9+
@test get_val_from(1, :test) == data
10+
11+
remove_from(1, :test)
12+
13+
@test sizeof(get_val_from(1, :test)) == 0 #should be "nothing" but this is more generic
14+
end
15+
16+
addprocs(3)
17+
@everywhere using DiDa
18+
W = workers()
19+
20+
@testset "Distributed data transfers -- with workers" begin
21+
data = [rand(5) for i in W]
22+
for (i, w) in enumerate(W)
23+
save_at(w, :test, data[i])
24+
end
25+
26+
@test [fetch(get_from(w, :test)) for w in W] == data
27+
@test [get_val_from(w, :test) for w in W] == data
28+
29+
undistribute(:test, W)
30+
31+
@test sum([sizeof(get_val_from(w, :test)) for w in W]) == 0
32+
end
33+
34+
@testset "Data distribution" begin
35+
d = rand(100, 5)
36+
37+
di = distribute_array(:test, d, W)
38+
39+
@test di.val == :test
40+
@test Set(di.workers) == Set(W)
41+
@test begin
42+
d1 = get_val_from(di.workers[1], :test)
43+
d1 == d[1:size(d1, 1), :]
44+
end
45+
46+
#TODO test actual sizes of the distributed pieces
47+
48+
@test distributed_collect(di, free = false) == d #TODO test with true
49+
@test sum([sizeof(get_val_from(w, :test)) for w in W]) > 0
50+
undistribute(di)
51+
@test sum([sizeof(get_val_from(w, :test)) for w in W]) == 0
52+
end
53+
54+
@testset "Distributed computation" begin
55+
di = distributed_transform(:(), x -> rand(5), W, :test)
56+
57+
@test get_val_from(W[1], :test) == distributed_collect(di)[1:5]
58+
59+
orig = distributed_collect(di)
60+
61+
@test isapprox(
62+
distributed_mapreduce(:test, d -> sum(d .^ 2), (a, b) -> a + b, W),
63+
sum(orig .^ 2),
64+
)
65+
66+
distributed_transform(di, d -> d .* 2)
67+
68+
@test orig .* 2 == distributed_collect(:test, W)
69+
70+
@test isapprox(
71+
distributed_mapreduce(di, d -> sum(d .^ 2), (a, b) -> a + b),
72+
sum((orig .* 2) .^ 2),
73+
)
74+
75+
t = zeros(length(W))
76+
exp = zeros(length(W))
77+
78+
t[1] = 2
79+
exp[1] = sum(2 .* get_val_from(W[1], :test))
80+
81+
@test distributed_foreach(t, (i) -> eval(:(sum($i .* $(di.val)))), W) == exp
82+
83+
undistribute(di)
84+
85+
@test distributed_mapreduce(:noname, x -> x, (a, b) -> a + b, []) == nothing
86+
end
87+
88+
@testset "Internal utilities" begin
89+
@test DiDa.tmpSym(:test) != :test
90+
@test DiDa.tmpSym(:test, prefix = "abc", suffix = "def") == :abctestdef
91+
@test DiDa.tmpSym(Dinfo(:test, W)) != :test
92+
end
93+
94+
@testset "Persistent distributed data" begin
95+
di = distributed_transform(:(), x -> rand(5), W, :test)
96+
97+
files = DiDa.defaultFiles(di.val, di.workers)
98+
@test allunique(files)
99+
100+
orig = distributed_collect(di)
101+
distributed_export(di, files)
102+
distributed_transform(di, x -> "erased")
103+
distributed_import(di, files)
104+
105+
@test orig == distributed_collect(di)
106+
107+
distributed_export(di.val, di.workers, files)
108+
di2 = distributed_import(:test2, di.workers, files)
109+
110+
@test orig == distributed_collect(di2)
111+
112+
undistribute(di)
113+
undistribute(di2)
114+
115+
distributed_unlink(di)
116+
117+
@test all([!isfile(f) for f in files])
118+
end
119+
120+
rmprocs(W)
121+
W = nothing
122+
123+
end

test/runtests.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
2+
using Test
3+
using DiDa, Distributed, Random
4+
5+
@testset "DiDa tests" begin
6+
include("base.jl")
7+
include("tools.jl")
8+
end

0 commit comments

Comments
 (0)