I’m trying to create a 2D array (six columns and lots of rows) with numpy random choice, with unique values between 1 and 50 within every row, not across the whole array:
np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False))
But this raises an error.
ValueError: Cannot take a larger sample than population when 'replace=False'
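For reference, a minimal check of why that call fails, assuming the line above: replace=False applies to the entire draw, not to each row, so the population of 49 values cannot cover the 600 requested samples.

import numpy as np

pool = np.arange(1, 50)          # only 49 distinct values
requested = 100 * 6              # size=(100, 6) asks for 600 samples in one call
print(pool.size, requested)      # 49 600 -> cannot sample 600 without replacement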
Is it possible to do this with a one-liner, without a loop?
Edit
Okay, I got the answer.
These are the results with the Jupyter %%time cell magic:
#@James' solution
np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(1_000_000)])
Wall time: 25.1 s

#@Divakar's solution
np.random.rand(1_000_000, 50).argpartition(6,axis=1)[:,:6]+1
Wall time: 1.36 s

#@CoryKramer's solution
np.array([np.random.choice(np.arange(1, 50), size=6, replace=False) for _ in range(1_000_000)])
Wall time: 25.5 s
I changed the dtypes of np.empty and np.random.randint in @Paul Panzer’s solution because it was not working on my PC.
3.6.0 |Anaconda custom (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]
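The likely reason (an assumption, based on NumPy defaulting to 32-bit integers on Windows builds): the encoding bound 50*49*48*47*46*45 does not fit in an int32, so np.empty and np.random.randint need explicit 64-bit dtypes. A minimal check:

import numpy as np

bound = 50*49*48*47*46*45                    # 11_441_304_000, one integer per full draw
print(bound > np.iinfo(np.int32).max)        # True -- overflows a 32-bit integer
# hence the explicit dtypes (np.int64 for np.empty, np.uint64 for randint) below
sample = np.random.randint(0, bound, (5,), dtype=np.uint64)
print(sample)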
The fastest one is:
def pp(n):
    draw = np.empty((n, 6), dtype=np.int64)
    # generating random numbers is expensive, so draw a large one and
    # make six out of one
    draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,), dtype=np.uint64)
    draw[:, 1:] = np.arange(50, 45, -1)
    draw = np.floor_divide.accumulate(draw, axis=-1)
    draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
    # map the shorter ranges (:49, :48, :47) to the non-occupied
    # positions; this amounts to incrementing for each number on the
    # left that is not larger. the nasty bit: if due to incrementing
    # new numbers on the left are "overtaken" then for them we also
    # need to increment.
    for i in range(1, 6):
        coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
        collidx = np.flatnonzero(coll)
        if collidx.size == 0:
            continue
        coll = coll[collidx]
        tot = coll
        while True:
            draw[collidx, i] += coll
            coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
            relidx = np.flatnonzero(coll > tot)
            if relidx.size == 0:
                break
            coll, tot = coll[relidx]-tot[relidx], coll[relidx]
            collidx = collidx[relidx]
    return draw + 1

#@Paul Panzer's solution
pp(1_000_000)
Wall time: 557 ms
Thank you all.
Answer
Here is a constructive approach: draw the first number (50 choices), then the second (49 choices), etc. For large sets it’s quite competitive (pp in the table below):
# n = 10
# pp                  0.18564210 ms
# Divakar             0.01960790 ms
# James               0.20074140 ms
# CK                  0.17823420 ms
# n = 1000
# pp                  0.80046050 ms
# Divakar             1.31817130 ms
# James              18.93511460 ms
# CK                 20.83670820 ms
# n = 1000000
# pp                655.32905590 ms
# Divakar          1352.44713990 ms
# James           18471.08987370 ms
# CK              18369.79808050 ms
# pp  checking plausibility...
#   var  (exp obs) 208.333333333 208.363840259
#   mean (exp obs) 25.5 25.5064865
# Divakar  checking plausibility...
#   var  (exp obs) 208.333333333 208.21113972
#   mean (exp obs) 25.5 25.499471
# James  checking plausibility...
#   var  (exp obs) 208.333333333 208.313436938
#   mean (exp obs) 25.5 25.4979035
# CK  checking plausibility...
#   var  (exp obs) 208.333333333 208.169585249
#   mean (exp obs) 25.5 25.49
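A minimal, loop-based sketch of the constructive idea (not the vectorized implementation below; the helper name draw_one is mine): one random integer below 50*49*48*47*46*45 is decomposed into mixed-radix digits, and each digit indexes the numbers still available.

import random

def draw_one():
    # one random integer encodes the whole six-number draw
    r = random.randrange(50*49*48*47*46*45)
    remaining = list(range(1, 51))        # numbers still available
    picks = []
    for radix in range(50, 44, -1):       # 50, 49, ..., 45 free slots left
        r, d = divmod(r, radix)           # next mixed-radix "digit"
        picks.append(remaining.pop(d))    # map the digit onto a free number
    return picks

print(draw_one())   # six distinct values in 1..50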
The code, including benchmarking, follows. The algorithm is a bit complicated because mapping to the free spots is hairy:
import numpy as np
import types
from timeit import timeit

def f_pp(n):
    draw = np.empty((n, 6), dtype=int)
    # generating random numbers is expensive, so draw a large one and
    # make six out of one
    draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,))
    draw[:, 1:] = np.arange(50, 45, -1)
    draw = np.floor_divide.accumulate(draw, axis=-1)
    draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
    # map the shorter ranges (:49, :48, :47) to the non-occupied
    # positions; this amounts to incrementing for each number on the
    # left that is not larger. the nasty bit: if due to incrementing
    # new numbers on the left are "overtaken" then for them we also
    # need to increment.
    for i in range(1, 6):
        coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
        collidx = np.flatnonzero(coll)
        if collidx.size == 0:
            continue
        coll = coll[collidx]
        tot = coll
        while True:
            draw[collidx, i] += coll
            coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
            relidx = np.flatnonzero(coll > tot)
            if relidx.size == 0:
                break
            coll, tot = coll[relidx]-tot[relidx], coll[relidx]
            collidx = collidx[relidx]
    return draw + 1

def check_result(draw, name):
    print(name[2:], ' checking plausibility...')
    import scipy.stats
    assert all(len(set(row)) == 6 for row in draw)
    assert len(set(draw.ravel())) == 50
    print('  var  (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var())
    print('  mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean())

def f_Divakar(n):
    return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1

def f_James(n):
    return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)])

def f_CK(n):
    return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)])

for n in (10, 1_000, 1_000_000):
    print(f'n = {n}')
    for name, func in list(globals().items()):
        if not name.startswith('f_') or not isinstance(func, types.FunctionType):
            continue
        try:
            print("{:16s}{:16.8f} ms".format(name[2:], timeit(
                'f(n)', globals={'f':func, 'n':n}, number=10)*100))
        except:
            print("{:16s} apparently failed".format(name[2:]))
    if n >= 10000:
        for name, func in list(globals().items()):
            if name.startswith('f_') and isinstance(func, types.FunctionType):
                check_result(func(n), name)