您的
crawl 方法在每个单独的任务之后都会等待;您应该将其改为:
async def crawl(self): tasks = [] while self.active_fetches < self.MAX_ACTIVE_FETCHES: if self.urlq: url = self.urlq.pop() tasks.append(asyncio.create_task(self.fetch(url))) await asyncio.gather(*tasks)
编辑:这是带有注释的更干净的版本,它可以同时提取和处理所有 URL,同时保留了限制最大提取器数量的基本功能。
import asyncioclass Crawler: def __init__(self, urls, max_workers=2): self.urls = urls # create a queue that only allows a maximum of two items self.fetching = asyncio.Queue() self.max_workers = max_workers async def crawl(self): # DON'T await here; start consuming things out of the queue, and # meanwhile execution of this function continues. We'll start two # coroutines for fetching and two coroutines for processing. all_the_coros = asyncio.gather( *[self._worker(i) for i in range(self.max_workers)]) # place all URLs on the queue for url in self.urls: await self.fetching.put(url) # now put a bunch of `None`'s in the queue as signals to the workers # that there are no more items in the queue. for _ in range(self.max_workers): await self.fetching.put(None) # now make sure everything is done await all_the_coros async def _worker(self, i): while True: url = await self.fetching.get() if url is None: # this coroutine is done; simply return to exit return print(f'Fetch worker {i} is fetching a URL: {url}') page = await self.fetch(url) self.process(page) async def fetch(self, url): print("Fetching URL: " + url); await asyncio.sleep(2) return f"the contents of {url}" def process(self, page): print("processed page: " + page)# main loopc = Crawler(['http://www.google.com', 'http://www.yahoo.com', 'http://www.cnn.com', 'http://www.gamespot.com', 'http://www.facebook.com', 'http://www.evergreen.edu'])asyncio.run(c.crawl())


