Skip to content

Commit 315b6c0

Browse files
authored
Merge pull request #143 from opensource9ja/fix/sample-bug
Fix sample bug
2 parents b20ff15 + c7a8cc2 commit 315b6c0

14 files changed

Lines changed: 991 additions & 997 deletions

File tree

danfojs-browser/lib/bundle.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

danfojs-browser/lib/bundle.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

danfojs-browser/src/core/frame.js

Lines changed: 30 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* limitations under the License.
1313
*/
1414

15-
import { tensor } from "@tensorflow/tfjs";
15+
import * as tf from "@tensorflow/tfjs";
1616
import Ndframe from "./generic";
1717
import { Series } from "./series";
1818
import { Utils } from "./utils";
@@ -183,7 +183,7 @@ export class DataFrame extends Ndframe {
183183
index: new_index
184184
});
185185
} else {
186-
this.row_data_tensor = tensor(new_data);
186+
this.row_data_tensor = tf.tensor(new_data);
187187
this.data = new_data;
188188
this.__set_index(new_index);
189189
}
@@ -266,36 +266,24 @@ export class DataFrame extends Ndframe {
266266

267267
/**
268268
* Gets [num] number of random rows in a dataframe
269-
* @param {rows} rows --> int
270-
* @returns DataFrame
269+
* @param {number} num --> The number of rows to return. Defaults to -1, which returns all rows of the DataFrame
270+
* @param {number} seed --> (Optional) An integer random seed passed to the shuffle used to pick rows. Defaults to 1
271+
* @returns {Promise} resolves to a DataFrame object
271272
*/
272-
sample(num = 5) {
273-
//TODO: Use different sampling strategy
274-
if (num > this.values.length || num < 1) {
275-
//return all values
276-
let config = { columns: this.column_names };
277-
return new DataFrame(this.values, config);
278-
} else {
279-
let values = this.values;
280-
let idx = this.index;
281-
let new_values = [];
282-
let new_idx = [];
283-
284-
let counts = [ ...Array(idx.length).keys() ]; //set index
285-
286-
//get random sampled numbers
287-
let rand_nums = utils.__sample_from_iter(counts, num, false);
288-
rand_nums.map((i) => {
289-
new_values.push(values[i]);
290-
new_idx.push(idx[i]);
291-
});
292-
293-
let config = { columns: this.column_names, index: new_idx };
294-
let df = new DataFrame(new_values, config);
295-
return df;
273+
async sample(num = -1, seed = 1) {
274+
if (num > this.shape[0]) {
275+
throw new Error("Sample size n cannot be bigger than size of dataset");
296276
}
277+
if (num < -1 || num == 0) {
278+
throw new Error("Sample size cannot be less than -1 or 0");
279+
}
280+
num = num === -1 ? this.shape[0] : num;
281+
const shuffled_index = await tf.data.array(this.index).shuffle(num, seed).take(num).toArray();
282+
const df = this.iloc({ rows: shuffled_index });
283+
return df;
297284
}
298285

286+
299287
/**
300288
* Return Addition of DataFrame and other, element-wise (binary operator add).
301289
* @param {other} DataFrame, Series, Array or Number to add
@@ -916,7 +904,7 @@ export class DataFrame extends Ndframe {
916904
}
917905

918906
values.map((arr) => {
919-
let temp_sum = tensor(arr).sum().arraySync();
907+
let temp_sum = tf.tensor(arr).sum().arraySync();
920908
val_sums.push(Number(temp_sum.toFixed(5)));
921909
});
922910

@@ -940,7 +928,7 @@ export class DataFrame extends Ndframe {
940928
abs() {
941929
let data = this.values;
942930

943-
let tensor_data = tensor(data);
931+
let tensor_data = tf.tensor(data);
944932
let abs_data = tensor_data.abs().arraySync();
945933
let df = new DataFrame(utils.__round(abs_data, 2, false), {
946934
columns: this.column_names,
@@ -1406,7 +1394,7 @@ export class DataFrame extends Ndframe {
14061394
}
14071395

14081396
for (let i = 0; i < df_data.length; i++) {
1409-
let value = tensor(df_data[i]);
1397+
let value = tf.tensor(df_data[i]);
14101398
let callable_data;
14111399
try {
14121400
callable_data = callable(value).arraySync();
@@ -1659,18 +1647,18 @@ export class DataFrame extends Ndframe {
16591647
`Shape Error: Operands could not be broadcast together with shapes ${this.shape} and ${val.values.length}.`
16601648
);
16611649
}
1662-
other = tensor(val.values);
1650+
other = tf.tensor(val.values);
16631651
} else {
16641652
if (val.values.length != this.shape[1]) {
16651653
throw Error(
16661654
`Shape Error: Operands could not be broadcast together with shapes ${this.shape} and ${val.values.length}.`
16671655
);
16681656
}
1669-
other = tensor(val.values);
1657+
other = tf.tensor(val.values);
16701658
}
16711659
} else if (Array.isArray(val)) {
16721660
//Array of Array
1673-
other = tensor(val);
1661+
other = tf.tensor(val);
16741662
} else {
16751663
//DataFrame
16761664
other = val.row_data_tensor;
@@ -1679,22 +1667,22 @@ export class DataFrame extends Ndframe {
16791667

16801668
switch (logical_type) {
16811669
case "lt":
1682-
int_vals = tensor(this.values).less(other).arraySync();
1670+
int_vals = tf.tensor(this.values).less(other).arraySync();
16831671
break;
16841672
case "gt":
1685-
int_vals = tensor(this.values).greater(other).arraySync();
1673+
int_vals = tf.tensor(this.values).greater(other).arraySync();
16861674
break;
16871675
case "le":
1688-
int_vals = tensor(this.values).lessEqual(other).arraySync();
1676+
int_vals = tf.tensor(this.values).lessEqual(other).arraySync();
16891677
break;
16901678
case "ge":
1691-
int_vals = tensor(this.values).greaterEqual(other).arraySync();
1679+
int_vals = tf.tensor(this.values).greaterEqual(other).arraySync();
16921680
break;
16931681
case "ne":
1694-
int_vals = tensor(this.values).notEqual(other).arraySync();
1682+
int_vals = tf.tensor(this.values).notEqual(other).arraySync();
16951683
break;
16961684
case "eq":
1697-
int_vals = tensor(this.values).equal(other).arraySync();
1685+
int_vals = tf.tensor(this.values).equal(other).arraySync();
16981686
break;
16991687
}
17001688
let bool_vals = utils.__map_int_to_bool(int_vals, 2);
@@ -1754,7 +1742,7 @@ export class DataFrame extends Ndframe {
17541742

17551743
this_tensor = tensors[0].row_data_tensor; //tensorflow uses 1 for rows axis and 0 for column axis
17561744
if (tensors[1].series) {
1757-
other_tensor = tensor(tensors[1].values, [
1745+
other_tensor = tf.tensor(tensors[1].values, [
17581746
1,
17591747
tensors[1].values.length
17601748
]);
@@ -1771,7 +1759,7 @@ export class DataFrame extends Ndframe {
17711759

17721760
this_tensor = tensors[0].row_data_tensor;
17731761
if (tensors[1].series) {
1774-
other_tensor = tensor(tensors[1].values, [
1762+
other_tensor = tf.tensor(tensors[1].values, [
17751763
tensors[1].values.length,
17761764
1
17771765
]);

danfojs-browser/src/core/series.js

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
*/
1515

1616

17-
import { tensor, round } from "@tensorflow/tfjs";
17+
import * as tf from "@tensorflow/tfjs";
1818
import { variance, std } from 'mathjs';
1919
import { Utils } from "./utils";
2020
import { Str } from "./strings";
@@ -55,7 +55,7 @@ export class Series extends NDframe {
5555
* @returns {1D Tensor}
5656
*/
5757
get tensor() {
58-
return tensor(this.values).asType(this.dtypes[0]);
58+
return tf.tensor(this.values).asType(this.dtypes[0]);
5959
}
6060

6161

@@ -95,30 +95,22 @@ export class Series extends NDframe {
9595
}
9696

9797
/**
98-
* Returns n number of random rows in a Series
99-
* @param {rows} number of rows to return
100-
* @returns {Series}
98+
* Gets [num] number of random rows in a Series
99+
* @param {number} num --> The number of rows to return. Defaults to 5; pass -1 to return all rows of the Series
100+
* @param {number} seed --> (Optional) An integer random seed passed to the shuffle used to pick rows. Defaults to 1
101+
* @returns {Promise} resolves to a Series object
101102
*/
102-
sample(num = 5) {
103-
if (num > this.values.length || num < 1) {
104-
let config = { columns: this.column_names };
105-
return new Series(this.values, config);
106-
} else {
107-
let values = this.values;
108-
let idx = this.index;
109-
let new_values = [];
110-
let new_idx = [];
111-
let rand_nums = utils.__shuffle(num, idx);
112-
113-
rand_nums.forEach((i) => {
114-
new_values.push(values[i]);
115-
new_idx.push(idx[i]);
116-
});
117-
let config = { columns: this.column_names, index: new_idx };
118-
let sf = new Series(new_values, config);
119-
return sf;
120-
103+
async sample(num = 5, seed = 1) {
104+
if (num > this.shape[0]) {
105+
throw new Error("Sample size n cannot be bigger than size of dataset");
121106
}
107+
if (num < -1 || num == 0) {
108+
throw new Error("Sample size cannot be less than -1 or 0");
109+
}
110+
num = num === -1 ? this.shape[0] : num;
111+
const shuffled_index = await tf.data.array(this.index).shuffle(num, seed).take(num).toArray();
112+
const sf = this.iloc(shuffled_index);
113+
return sf;
122114
}
123115

124116
/**
@@ -250,7 +242,7 @@ export class Series extends NDframe {
250242
mean() {
251243
utils._throw_str_dtype_error(this, 'mean');
252244
let values = utils._remove_nans(this.values);
253-
let mean = tensor(values).mean().arraySync();
245+
let mean = tf.tensor(values).mean().arraySync();
254246
return mean;
255247
}
256248

@@ -382,7 +374,7 @@ export class Series extends NDframe {
382374
round(dp) {
383375
if (utils.__is_undefined(dp)) {
384376
//use tensorflow round function to round to the nearest whole number
385-
let result = round(this.row_data_tensor).arraySync();
377+
let result = tf.round(this.row_data_tensor).arraySync();
386378
return new Series(result, { columns: this.column_names, index: this.index });
387379

388380
} else {

danfojs-browser/tests/core/frame.js

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
11
/* eslint-disable no-undef */
2-
const fs = require("fs");
3-
4-
const testCSVPath = "./tester.csv";
52

63
describe("DataFrame", function () {
74

@@ -164,29 +161,52 @@ describe("DataFrame", function () {
164161
});
165162

166163
describe("sample", function () {
167-
it("Samples n number of random elements from a DataFrame", function () {
164+
it("Samples n number of random elements from a DataFrame", async function () {
168165
let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ], [ 100, 200, 300 ] ];
169166
let cols = [ "A", "B", "C" ];
170167
let df = new dfd.DataFrame(data, { columns: cols });
171-
assert.deepEqual(df.sample(2).shape, [ 2, 3 ]);
168+
let expected = [ [ 1, 2, 3 ], [ 20, 30, 40 ] ];
169+
let values = (await df.sample(2)).values;
170+
assert.deepEqual(values, expected);
172171
});
173-
it("Samples n number of random elements from a DataFrame", function () {
172+
it("Throw error if n is greater than lenght of Dataframe", async function () {
174173
let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ], [ 100, 200, 300 ] ];
175174
let cols = [ "A", "B", "C" ];
176175
let df = new dfd.DataFrame(data, { columns: cols });
177-
assert.deepEqual(df.sample().shape, [ 5, 3 ]);
178-
});
179-
it("Return all values if n of sample is greater than lenght of Dataframe", function () {
176+
try {
177+
await df.sample(100);
178+
} catch (e) {
179+
expect(e).to.be.instanceOf(Error);
180+
expect(e.message).to.eql('Sample size n cannot be bigger than size of dataset');
181+
}
182+
});
183+
it("Throw error if n is less than -1", async function () {
180184
let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ], [ 100, 200, 300 ] ];
181185
let cols = [ "A", "B", "C" ];
182186
let df = new dfd.DataFrame(data, { columns: cols });
183-
assert.deepEqual(df.sample(6).shape, [ 5, 3 ]);
184-
});
185-
it("Return all values if n of sample is less than 1", function () {
187+
try {
188+
await df.sample(-2);
189+
} catch (e) {
190+
expect(e).to.be.instanceOf(Error);
191+
expect(e.message).to.eql('Sample size cannot be less than -1 or 0');
192+
}
193+
});
194+
it("Throw error if n is 0", async function () {
195+
let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ], [ 100, 200, 300 ] ];
196+
let cols = [ "A", "B", "C" ];
197+
let df = new dfd.DataFrame(data, { columns: cols });
198+
try {
199+
await df.sample(0);
200+
} catch (e) {
201+
expect(e).to.be.instanceOf(Error);
202+
expect(e.message).to.eql('Sample size cannot be less than -1 or 0');
203+
}
204+
});
205+
it("Return all values if n is -1", async function () {
186206
let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ], [ 100, 200, 300 ] ];
187207
let cols = [ "A", "B", "C" ];
188208
let df = new dfd.DataFrame(data, { columns: cols });
189-
assert.deepEqual(df.sample(-1).shape, [ 5, 3 ]);
209+
assert.deepEqual((await df.sample(-1)).shape, [ 5, 3 ]);
190210
});
191211
});
192212

@@ -820,7 +840,7 @@ describe("DataFrame", function () {
820840
let expected = [ [ 2, 4, 6, 'c' ], [ 360, 180, 1, 'b' ], [ 0, 2, 4, 'a' ] ];
821841
assert.deepEqual(df.sort_values({ "by": "col4", "ascending": false }).values, expected);
822842
});
823-
it("Sort duplicate DataGrame with duplicate columns", function(){
843+
it("Sort duplicate DataGrame with duplicate columns", function () {
824844

825845
let data = {
826846
"A": [ 1, 2, 3, 4, 5, 3, 5, 6, 4, 5, 3, 4 ],
@@ -1025,10 +1045,12 @@ describe("DataFrame", function () {
10251045
df.query({ "column": "B", "is": ">=", "to": 5, inplace: true });
10261046
assert.deepEqual(df.index, [ 1, 2, 3 ]);
10271047
});
1028-
it("Wrong query value", function(){
1029-
let data = { "A": [ 30, 1, 2, 3 ],
1048+
it("Wrong query value", function () {
1049+
let data = {
1050+
"A": [ 30, 1, 2, 3 ],
10301051
"B": [ 34, 4, 5, 6 ],
1031-
"C": [ 20, 20, 30, 40 ] };
1052+
"C": [ 20, 20, 30, 40 ]
1053+
};
10321054

10331055
let cols = [ "A", "B", "C" ];
10341056
let df = new dfd.DataFrame(data, { columns: cols });

danfojs-browser/tests/core/series.js

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/* eslint-disable no-undef */
2-
const tf = require("@tensorflow/tfjs-core");
2+
const tf = require("@tensorflow/tfjs");
33

44
describe("Series", function () {
55
describe("tensor", function () {
@@ -78,20 +78,25 @@ describe("Series", function () {
7878
});
7979

8080
describe("sample", function () {
81-
it("Samples n number of random elements from a DataFrame", function () {
81+
it("Samples n number of random elements from a DataFrame", async function () {
8282
let data = [ 1, 2, 3, 4, 5, 620, 30, 40, 39, 89, 78 ];
8383
let sf = new dfd.Series(data);
84-
assert.deepEqual(sf.sample(7).values.length, 7);
84+
assert.deepEqual((await sf.sample(7)).values.length, 7);
8585
});
86-
it("Return all values if n of sample is greater than lenght of Dataframe", function () {
86+
it("Return all values if n of sample -1", async function () {
8787
let data = [ 1, 2, 3, 4, 5, 620, 30, 40, 39, 89, 78 ];
8888
let sf = new dfd.Series(data);
89-
assert.deepEqual(sf.sample(21).values.length, data.length);
89+
assert.deepEqual((await sf.sample(-1)).values.length, data.length);
9090
});
91-
it("Return all values if n of sample is less than 1", function () {
91+
it("Throw error if n is greater than lenght of Series", async function () {
9292
let data = [ 1, 2, 3, 4, 5, 620, 30, 40, 39, 89, 78 ];
9393
let sf = new dfd.Series(data);
94-
assert.deepEqual(sf.sample(-2).values.length, data.length);
94+
try {
95+
await sf.sample(100);
96+
} catch (e) {
97+
expect(e).to.be.instanceOf(Error);
98+
expect(e.message).to.eql('Sample size n cannot be bigger than size of dataset');
99+
}
95100
});
96101
});
97102

danfojs-node/.eslintrc.json

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
{ "before": true
3232
}
3333
],
34-
"array-bracket-spacing": [ "error", "always"
35-
],
3634
"space-infix-ops": "error",
3735
"object-curly-spacing": [ "error", "always"
3836
],

0 commit comments

Comments
 (0)