I’m trying to create a 2d array (which is a six column and lots of rows) with numpy random choice with unique values between 1 and 50 for every row not all of the array
JavaScript
x
2
1
np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False))
2
But this raises an error.
JavaScript
1
2
1
ValueError: Cannot take a larger sample than population when 'replace=False'
2
Is it possible to make this with an one liner without a loop
Edit
Okey i get the answer.
These are the results with jupyter %time cellmagic
JavaScript
1
16
16
1
#@James' solution
2
np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(1_000_000)])
3
Wall time: 25.1 s
4
5
6
7
#@Divakar's solution
8
np.random.rand(1_000_000, 50).argpartition(6,axis=1)[:,:6]+1
9
Wall time: 1.36 s
10
11
12
13
#@CoryKramer's solution
14
np.array([np.random.choice(np.arange(1, 50), size=6, replace=False) for _ in range(1_000_000)])
15
Wall time: 25.5 s
16
I changed dtypes of np.empty and np.random.randint on @Paul Panzer’s solution because it was not working on my pc.
JavaScript
1
2
1
3.6.0 |Anaconda custom (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]
2
Fastest one is
JavaScript
1
35
35
1
def pp(n):
2
draw = np.empty((n, 6), dtype=np.int64)
3
# generating random numbers is expensive, so draw a large one and
4
# make six out of one
5
draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,),dtype=np.uint64)
6
draw[:, 1:] = np.arange(50, 45, -1)
7
draw = np.floor_divide.accumulate(draw, axis=-1)
8
draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
9
# map the shorter ranges (:49, :48, :47) to the non-occupied
10
# positions; this amounts to incrementing for each number on the
11
# left that is not larger. the nasty bit: if due to incrementing
12
# new numbers on the left are "overtaken" then for them we also
13
# need to increment.
14
for i in range(1, 6):
15
coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
16
collidx = np.flatnonzero(coll)
17
if collidx.size == 0:
18
continue
19
coll = coll[collidx]
20
tot = coll
21
while True:
22
draw[collidx, i] += coll
23
coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
24
relidx = np.flatnonzero(coll > tot)
25
if relidx.size == 0:
26
break
27
coll, tot = coll[relidx]-tot[relidx], coll[relidx]
28
collidx = collidx[relidx]
29
30
return draw + 1
31
32
#@Paul Panzer' solution
33
pp(1_000_000)
34
Wall time: 557 ms
35
Thank you all.
Advertisement
Answer
Here is a constructive approach, draw first (50 choices), second (49 choices) etc. For large sets it’s quite competitive (pp in table):
JavaScript
1
28
28
1
# n = 10
2
# pp 0.18564210 ms
3
# Divakar 0.01960790 ms
4
# James 0.20074140 ms
5
# CK 0.17823420 ms
6
# n = 1000
7
# pp 0.80046050 ms
8
# Divakar 1.31817130 ms
9
# James 18.93511460 ms
10
# CK 20.83670820 ms
11
# n = 1000000
12
# pp 655.32905590 ms
13
# Divakar 1352.44713990 ms
14
# James 18471.08987370 ms
15
# CK 18369.79808050 ms
16
# pp checking plausibility...
17
# var (exp obs) 208.333333333 208.363840259
18
# mean (exp obs) 25.5 25.5064865
19
# Divakar checking plausibility...
20
# var (exp obs) 208.333333333 208.21113972
21
# mean (exp obs) 25.5 25.499471
22
# James checking plausibility...
23
# var (exp obs) 208.333333333 208.313436938
24
# mean (exp obs) 25.5 25.4979035
25
# CK checking plausibility...
26
# var (exp obs) 208.333333333 208.169585249
27
# mean (exp obs) 25.5 25.49
28
Code including benchmarking. Algo is a bit complicated because mapping to free spots is hairy:
JavaScript
1
68
68
1
import numpy as np
2
import types
3
from timeit import timeit
4
5
def f_pp(n):
6
draw = np.empty((n, 6), dtype=int)
7
# generating random numbers is expensive, so draw a large one and
8
# make six out of one
9
draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,))
10
draw[:, 1:] = np.arange(50, 45, -1)
11
draw = np.floor_divide.accumulate(draw, axis=-1)
12
draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
13
# map the shorter ranges (:49, :48, :47) to the non-occupied
14
# positions; this amounts to incrementing for each number on the
15
# left that is not larger. the nasty bit: if due to incrementing
16
# new numbers on the left are "overtaken" then for them we also
17
# need to increment.
18
for i in range(1, 6):
19
coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
20
collidx = np.flatnonzero(coll)
21
if collidx.size == 0:
22
continue
23
coll = coll[collidx]
24
tot = coll
25
while True:
26
draw[collidx, i] += coll
27
coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
28
relidx = np.flatnonzero(coll > tot)
29
if relidx.size == 0:
30
break
31
coll, tot = coll[relidx]-tot[relidx], coll[relidx]
32
collidx = collidx[relidx]
33
34
return draw + 1
35
36
def check_result(draw, name):
37
print(name[2:], ' checking plausibility...')
38
import scipy.stats
39
assert all(len(set(row)) == 6 for row in draw)
40
assert len(set(draw.ravel())) == 50
41
print(' var (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var())
42
print(' mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean())
43
44
def f_Divakar(n):
45
return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1
46
47
def f_James(n):
48
return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)])
49
50
def f_CK(n):
51
return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)])
52
53
for n in (10, 1_000, 1_000_000):
54
print(f'n = {n}')
55
for name, func in list(globals().items()):
56
if not name.startswith('f_') or not isinstance(func, types.FunctionType):
57
continue
58
try:
59
print("{:16s}{:16.8f} ms".format(name[2:], timeit(
60
'f(n)', globals={'f':func, 'n':n}, number=10)*100))
61
except:
62
print("{:16s} apparently failed".format(name[2:]))
63
if(n >= 10000):
64
for name, func in list(globals().items()):
65
if name.startswith('f_') and isinstance(func, types.FunctionType):
66
67
check_result(func(n), name)
68