You can also consider using a set, but constructing that set from your list may take more time than faster membership testing will save. The only way to be certain is to benchmark well. (this also depends on what operations you require)
import randomimport bisectimport matplotlib.pyplot as pltimport mathimport timedef method_in(a,b,c):
start_time = time.time()for i,x in enumerate(a):if x in b:
c[i]=1return(time.time()-start_time)def method_set_in(a,b,c):
start_time = time.time()
s = set(b)for i,x in enumerate(a):if x in s:
c[i]=1return(time.time()-start_time)def method_bisect(a,b,c):
start_time = time.time()
b.sort()for i,x in enumerate(a):
index = bisect.bisect_left(b,x)if index < len(a):if x == b[index]:
c[i]=1return(time.time()-start_time)def profile():
time_method_in =[]
time_method_set_in =[]
time_method_bisect =[]Nls=[x for x in range(1000,20000,1000)]for N inNls:
a =[x for x in range(0,N)]
random.shuffle(a)
b =[x for x in range(0,N)]
random.shuffle(b)
c =[0for x in range(0,N)]
time_method_in.append(math.log(method_in(a,b,c)))
time_method_set_in.append(math.log(method_set_in(a,b,c)))
time_method_bisect.append(math.log(method_bisect(a,b,c)))
plt.plot(Nls,time_method_in,marker='o',color='r',linestyle='-',label='in')
plt.plot(Nls,time_method_set_in,marker='o',color='b',linestyle='-',label='set')
plt.plot(Nls,time_method_bisect,marker='o',color='g',linestyle='-',label='bisect')
plt.xlabel('list size', fontsize=18)
plt.ylabel('log(time)', fontsize=18)
plt.legend(loc ='upper left')
plt.show()
As stated by others, in can be very slow for large lists. Here are some comparisons of the performances for in, set and bisect. Note the time (in second) is in log scale.
Code for testing:
import random
import bisect
import matplotlib.pyplot as plt
import math
import time
def method_in(a,b,c):
start_time = time.time()
for i,x in enumerate(a):
if x in b:
c[i] = 1
return(time.time()-start_time)
def method_set_in(a,b,c):
start_time = time.time()
s = set(b)
for i,x in enumerate(a):
if x in s:
c[i] = 1
return(time.time()-start_time)
def method_bisect(a,b,c):
start_time = time.time()
b.sort()
for i,x in enumerate(a):
index = bisect.bisect_left(b,x)
if index < len(a):
if x == b[index]:
c[i] = 1
return(time.time()-start_time)
def profile():
time_method_in = []
time_method_set_in = []
time_method_bisect = []
Nls = [x for x in range(1000,20000,1000)]
for N in Nls:
a = [x for x in range(0,N)]
random.shuffle(a)
b = [x for x in range(0,N)]
random.shuffle(b)
c = [0 for x in range(0,N)]
time_method_in.append(math.log(method_in(a,b,c)))
time_method_set_in.append(math.log(method_set_in(a,b,c)))
time_method_bisect.append(math.log(method_bisect(a,b,c)))
plt.plot(Nls,time_method_in,marker='o',color='r',linestyle='-',label='in')
plt.plot(Nls,time_method_set_in,marker='o',color='b',linestyle='-',label='set')
plt.plot(Nls,time_method_bisect,marker='o',color='g',linestyle='-',label='bisect')
plt.xlabel('list size', fontsize=18)
plt.ylabel('log(time)', fontsize=18)
plt.legend(loc = 'upper left')
plt.show()
You could put your items into a set. Set lookups are very efficient.
Try:
s = set(a)
if 7 in s:
# do stuff
edit In a comment you say that you’d like to get the index of the element. Unfortunately, sets have no notion of element position. An alternative is to pre-sort your list and then use binary search every time you need to find an element.
回答 3
def check_availability(element, collection: iter):return element in collection
a = [4,2,3,1,5,6]
index = dict((y,x) for x,y in enumerate(a))
try:
a_index = index[7]
except KeyError:
print "Not found"
else:
print "found"
This will only be a good idea if a doesn’t change and thus we can do the dict() part once and then use it repeatedly. If a does change, please provide more detail on what you are doing.
import random
import bisect
import matplotlib.pyplot as plt
import math
import timeit
import itertools
def wrapper(func,*args,**kwargs):" Use to produced 0 argument function for call it"# Reference https://www.pythoncentral.io/time-a-python-function/def wrapped():return func(*args,**kwargs)return wrapped
def method_in(a,b,c):for i,x in enumerate(a):if x in b:
c[i]= b.index(x)else:
c[i]=-1return c
def method_try(a,b,c):for i, x in enumerate(a):try:
c[i]= b.index(x)exceptValueError:
c[i]=-1def method_set_in(a,b,c):
s = set(b)for i,x in enumerate(a):if x in s:
c[i]= b.index(x)else:
c[i]=-1return c
def method_bisect(a,b,c):" Finds indexes using bisection "# Create a sorted b with its index
bsorted = sorted([(x, i)for i, x in enumerate(b)], key =lambda t: t[0])for i,x in enumerate(a):
index = bisect.bisect_left(bsorted,(x,))
c[i]=-1if index < len(a):if x == bsorted[index][0]:
c[i]= bsorted[index][1]# index in the b arrayreturn c
def method_reverse_lookup(a, b, c):
reverse_lookup ={x:i for i, x in enumerate(b)}for i, x in enumerate(a):
c[i]= reverse_lookup.get(x,-1)return c
def profile():Nls=[x for x in range(1000,20000,1000)]
number_iterations =10
methods =[method_in, method_try, method_set_in, method_bisect, method_reverse_lookup]
time_methods =[[]for _ in range(len(methods))]for N inNls:
a =[x for x in range(0,N)]
random.shuffle(a)
b =[x for x in range(0,N)]
random.shuffle(b)
c =[0for x in range(0,N)]for i, func in enumerate(methods):
wrapped = wrapper(func, a, b, c)
time_methods[i].append(math.log(timeit.timeit(wrapped, number=number_iterations)))
markers = itertools.cycle(('o','+','.','>','2'))
colors = itertools.cycle(('r','b','g','y','c'))
labels = itertools.cycle(('in','try','set','bisect','reverse'))for i in range(len(time_methods)):
plt.plot(Nls,time_methods[i],marker = next(markers),color=next(colors),linestyle='-',label=next(labels))
plt.xlabel('list size', fontsize=18)
plt.ylabel('log(time)', fontsize=18)
plt.legend(loc ='upper left')
plt.show()
profile()
What is the fastest way to know if a value exists in a list (a list
with millions of values in it) and what its index is?
Thus there are two things to find:
is an item in the list, and
what is the index (if in the list).
Towards this, I modified @xslittlegrass code to compute indexes in all cases, and added an additional method.
Results
Methods are:
in–basically if x in b: return b.index(x)
try–try/catch on b.index(x) (skips having to check if x in b)
set–basically if x in set(b): return b.index(x)
bisect–sort b with its index, binary search for x in sorted(b).
Note mod from @xslittlegrass who returns the index in the sorted b,
rather than the original b)
reverse–form a reverse lookup dictionary d for b; then
d[x] provides the index of x.
Results show that method 5 is the fastest.
Interestingly the try and the set methods are equivalent in time.
Test Code
import random
import bisect
import matplotlib.pyplot as plt
import math
import timeit
import itertools
def wrapper(func, *args, **kwargs):
" Use to produced 0 argument function for call it"
# Reference https://www.pythoncentral.io/time-a-python-function/
def wrapped():
return func(*args, **kwargs)
return wrapped
def method_in(a,b,c):
for i,x in enumerate(a):
if x in b:
c[i] = b.index(x)
else:
c[i] = -1
return c
def method_try(a,b,c):
for i, x in enumerate(a):
try:
c[i] = b.index(x)
except ValueError:
c[i] = -1
def method_set_in(a,b,c):
s = set(b)
for i,x in enumerate(a):
if x in s:
c[i] = b.index(x)
else:
c[i] = -1
return c
def method_bisect(a,b,c):
" Finds indexes using bisection "
# Create a sorted b with its index
bsorted = sorted([(x, i) for i, x in enumerate(b)], key = lambda t: t[0])
for i,x in enumerate(a):
index = bisect.bisect_left(bsorted,(x, ))
c[i] = -1
if index < len(a):
if x == bsorted[index][0]:
c[i] = bsorted[index][1] # index in the b array
return c
def method_reverse_lookup(a, b, c):
reverse_lookup = {x:i for i, x in enumerate(b)}
for i, x in enumerate(a):
c[i] = reverse_lookup.get(x, -1)
return c
def profile():
Nls = [x for x in range(1000,20000,1000)]
number_iterations = 10
methods = [method_in, method_try, method_set_in, method_bisect, method_reverse_lookup]
time_methods = [[] for _ in range(len(methods))]
for N in Nls:
a = [x for x in range(0,N)]
random.shuffle(a)
b = [x for x in range(0,N)]
random.shuffle(b)
c = [0 for x in range(0,N)]
for i, func in enumerate(methods):
wrapped = wrapper(func, a, b, c)
time_methods[i].append(math.log(timeit.timeit(wrapped, number=number_iterations)))
markers = itertools.cycle(('o', '+', '.', '>', '2'))
colors = itertools.cycle(('r', 'b', 'g', 'y', 'c'))
labels = itertools.cycle(('in', 'try', 'set', 'bisect', 'reverse'))
for i in range(len(time_methods)):
plt.plot(Nls,time_methods[i],marker = next(markers),color=next(colors),linestyle='-',label=next(labels))
plt.xlabel('list size', fontsize=18)
plt.ylabel('log(time)', fontsize=18)
plt.legend(loc = 'upper left')
plt.show()
profile()
It sounds like your application might gain advantage from the use of a Bloom Filter data structure.
In short, a bloom filter look-up can tell you very quickly if a value is DEFINITELY NOT present in a set. Otherwise, you can do a slower look-up to get the index of a value that POSSIBLY MIGHT BE in the list. So if your application tends to get the “not found” result much more often then the “found” result, you might see a speed up by adding a Bloom Filter.
For details, Wikipedia provides a good overview of how Bloom Filters work, and a web search for “python bloom filter library” will provide at least a couple useful implementations.
for element in s:if element is target:# fast check for identity implies equalityreturnTrueif element == target:# slower check for actual equalityreturnTruereturnFalse
Be aware that the in operator tests not only equality (==) but also identity (is), the in logic for lists is roughly equivalent to the following (it’s actually written in C and not Python though, at least in CPython):
for element in s:
if element is target:
# fast check for identity implies equality
return True
if element == target:
# slower check for actual equality
return True
return False
In most circumstances this detail is irrelevant, but in some circumstances it might leave a Python novice surprised, for example, numpy.NAN has the unusual property of being not being equal to itself:
>>> import numpy
>>> numpy.NAN == numpy.NAN
False
>>> numpy.NAN is numpy.NAN
True
>>> numpy.NAN in [numpy.NAN]
True
To distinguish between these unusual cases you could use any() like:
>>> lst = [numpy.NAN, 1 , 2]
>>> any(element == numpy.NAN for element in lst)
False
>>> any(element is numpy.NAN for element in lst)
True
Note the in logic for lists with any() would be:
any(element is target or element == target for element in lst)
However, I should emphasize that this is an edge case, and for the vast majority of cases the in operator is highly optimised and exactly what you want of course (either with a list or with a set).
@Winston Ewert’s solution yields a big speed-up for very large lists, but this stackoverflow answer indicates that the the try:/except:/else: construct will be slowed down if the except branch is often reached. An alternative is to take advantage of the .get() method for the dict:
a = [4,2,3,1,5,6]
index = dict((y, x) for x, y in enumerate(a))
b = index.get(7, None)
if b is not None:
"Do something with variable b"
The .get(key, default) method is just for the case when you can’t guarantee a key will be in the dict. If key is present, it returns the value (as would dict[key]), but when it is not, .get() returns your default value (here None). You need to make sure in this case that the chosen default will not be in a.
This is not the code, but the algorithm for very fast searching.
If your list and the value you are looking for are all numbers, this is pretty straightforward. If strings: look at the bottom:
-Let “n” be the length of your list
-Optional step: if you need the index of the element: add a second column to the list with current index of elements (0 to n-1) – see later
Order your list or a copy of it (.sort())
Loop through:
Compare your number to the n/2th element of the list
If larger, loop again between indexes n/2-n
If smaller, loop again between indexes 0-n/2
If the same: you found it
Keep narrowing the list until you have found it or only have 2 numbers (below and above the one you are looking for)
This will find any element in at most 19 steps for a list of 1.000.000 (log(2)n to be precise)
If you also need the original position of your number, look for it in the second, index column.
If your list is not made of numbers, the method still works and will be fastest, but you may need to define a function which can compare/order strings.
Of course, this needs the investment of the sorted() method, but if you keep reusing the same list for checking, it may be worth it.
回答 11
因为问题不一定总是被理解为最快的技术方法-我总是建议理解/编写最直接的最快方法:列表理解,单线
[i for i in list_from_which_to_search if i in list_to_search_in]
Because the question is not always supposed to be understood as the fastest technical way – I always suggest the most straightforward fastest way to understand/write: a list comprehension, one-liner
[i for i in list_from_which_to_search if i in list_to_search_in]
I had a list_to_search_in with all the items, and wanted to return the indexes of the items in the list_from_which_to_search.
This returns the indexes in a nice list.
There are other ways to check this problem – however list comprehensions are quick enough, adding to the fact of writing it quick enough, to solve a problem.
回答 12
对我而言,这是0.030秒(实际),0.026秒(用户)和0.004秒(系统)。
try:print("Started")
x =["a","b","c","d","e","f"]
i =0while i < len(x):
i +=1if x[i]=="e":print("Found")exceptIndexError:pass