Contents
1. Socket communication
1.1 The OSI model
2. Coroutines
2.1 Implementing coroutines with yield
2.2 greenlet: switching between multiple tasks within a single thread
2.3 gevent: another coroutine library, built on top of the greenlet C extension. a. blocking coroutines
b. non-blocking coroutines: gevent does not detect ordinary blocking calls by itself; import the monkey module and call monkey.patch_all() (the two extra lines in the second example)
1. Socket communication
1.1 The OSI model
Communication between hosts is layered according to the OSI model:
Physical -- Data Link -- Network -- Transport -- Session -- Presentation -- Application
At the transport layer sits the socket (network socket) API, the basis of the client/server (C/S) architecture over TCP or UDP.
MySQL, for example, talks to its clients over TCP.
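To make the client/server idea concrete, here is a minimal TCP echo server and client. This sketch is not part of the original notes; the loopback address, port 9000 and the message bytes are arbitrary choices for illustration.

# server.py -- minimal TCP echo server
import socket

server = socket.socket()                  # TCP socket (AF_INET / SOCK_STREAM by default)
server.bind(('127.0.0.1', 9000))
server.listen(1)
conn, addr = server.accept()              # block until a client connects
conn.send(conn.recv(1024))                # echo the received bytes back
conn.close()
server.close()

# client.py -- minimal TCP client
import socket

client = socket.socket()
client.connect(('127.0.0.1', 9000))
client.send(b'hello')
print(client.recv(1024))                  # b'hello'
client.close()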
2. Coroutines
A way to raise efficiency: concurrency within a single thread.
Advantages:
1) Context switches happen without threads, so there is no pointless OS scheduling, which improves performance.
2) No atomic operations or locks are needed, so locking and synchronization overhead is avoided.
3) Simpler programming model.
4) High concurrency, good scalability, low cost.
Disadvantages:
1) Cannot make use of multiple CPU cores.
2) A blocking call stalls the whole thread, so every coroutine in it is held up.
Units of concurrency, from heaviest to lightest:
Process
Thread
Coroutine
2.1 Implementing coroutines with yield

import time

def func1():
    # A bare generator: each yield hands control back to the caller.
    while True:
        yield

def func2():
    g = func1()
    for i in range(10000):
        i += 1
        next(g)                        # switch into the generator and straight back

def main():
    start = time.time()
    func2()
    print(time.time() - start)         # cost of 10000 switches

if __name__ == '__main__':
    main()
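The generator above only measures the cost of switching; nothing is exchanged between the two routines. To show data actually flowing back and forth through yield, here is a small producer/consumer sketch. It is an illustration added to these notes; the names consumer/producer and the values sent are arbitrary.

def consumer():
    # Coroutine-style consumer: receives values through yield and keeps a running total.
    total = 0
    while True:
        item = yield total             # hand the total back, wait for the next item
        total += item

def producer():
    c = consumer()
    next(c)                            # prime the generator up to its first yield
    for item in (1, 2, 3):
        total = c.send(item)           # switch into the consumer with a value
        print('running total:', total)

if __name__ == '__main__':
    producer()                         # prints 1, 3, 6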
2.2 greenlet: switching between multiple tasks within a single thread

import greenlet

greenlet.greenlet                # the greenlet type; greenlet(run=None, parent=None) creates a greenlet
                                 # object without starting it; run is the callable executed when the
                                 # greenlet is first switched to
greenlet.getcurrent()            # returns the greenlet that is currently running
greenlet.GreenletExit            # used to kill a greenlet without affecting its parent

Attributes and methods of a greenlet:
'dead'          # True once the greenlet has finished
'error'         # the exception type raised on invalid greenlet operations
'getcurrent'    # the currently running greenlet
'gettrace'      # read the installed trace callback
'gr_frame'      # the frame the greenlet is currently executing
'parent'        # every greenlet has a parent
'run'           # called when the greenlet is started
'settrace'      # install a trace callback to observe coroutine switches
'switch'        # switch execution between greenlets
'throw'         # raise an exception inside a greenlet
Example 1:

import greenlet

def FunA():
    print('FunA')          # 1st: printed when g1 starts
    g2.switch()            # switch to g2
    print('FunB')          # 3rd: printed after g2 switches back
    g2.switch()            # switch to g2 again so it can finish

def FunB():
    print('FunB')          # 2nd
    g1.switch()            # switch back to g1
    print('FunA')          # 4th

if __name__ == '__main__':
    g1 = greenlet.greenlet(FunA)
    g2 = greenlet.greenlet(FunB)
    g1.switch()            # start g1; output: FunA, FunB, FunB, FunA
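The settrace entry listed above can be used to watch these switches as they happen. The following is a minimal sketch added for illustration (the callback name trace_switch and the worker function are arbitrary); greenlet calls the installed callback with an event name ('switch' or 'throw') and an (origin, target) pair.

import greenlet

def trace_switch(event, args):
    # Called by the greenlet runtime on every switch or throw.
    if event in ('switch', 'throw'):
        origin, target = args
        print('{}: {} -> {}'.format(event, origin, target))

greenlet.settrace(trace_switch)        # install the trace callback

def worker():
    print('in worker')
    main_gr.switch()                   # hand control back to the main greenlet

main_gr = greenlet.getcurrent()
g = greenlet.greenlet(worker)
g.switch()                             # trace output shows main -> worker -> main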
2.3 gevent: another coroutine library, built on top of the greenlet C extension
a. Blocking coroutines
import gevent, time

def eat(name):
    print('%s eat 1' % name)
    time.sleep(2)                      # plain time.sleep: gevent cannot switch away from it
    print('%s eat 2' % name)

def study(name):
    print('%s study 1' % name)
    time.sleep(3)
    print('%s study 2' % name)

def main():
    g1 = gevent.spawn(eat, 'Alex')     # create a greenlet for each task
    g2 = gevent.spawn(study, 'Jill')
    g1.join()                          # wait for each greenlet to finish
    g2.join()
    print('main')

if __name__ == '__main__':
    main()
b. Non-blocking. In the example above gevent cannot detect the blocking call (time.sleep), so the two tasks simply run one after the other. To get non-blocking behaviour, import gevent's monkey module and call monkey.patch_all(), the two extra lines at the top of the example below; this replaces blocking standard-library calls with cooperative versions, the sleeps overlap, and the program finishes in roughly 3 seconds instead of about 5.
import gevent, time
from gevent import monkey
monkey.patch_all()                     # replace blocking stdlib calls (including time.sleep) with cooperative ones

def eat(name):
    print('%s eat 1' % name)
    time.sleep(2)                      # now yields to other greenlets instead of blocking
    print('%s eat 2' % name)

def study(name):
    print('%s study 1' % name)
    time.sleep(3)
    print('%s study 2' % name)

def main():
    g1 = gevent.spawn(eat, 'Alex')
    g2 = gevent.spawn(study, 'Jill')
    g1.join()
    g2.join()
    print('main')

if __name__ == '__main__':
    main()
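As a side note that is not in the original notes, the pair of join() calls can also be written with gevent.joinall, which waits on a list of greenlets; the task name and durations below are illustrative only.

import gevent
from gevent import monkey
monkey.patch_all()
import time

def task(name, seconds):
    print('%s start' % name)
    time.sleep(seconds)                # cooperative after monkey.patch_all()
    print('%s done' % name)

# joinall waits for every greenlet in the list, equivalent to calling join() on each one
gevent.joinall([
    gevent.spawn(task, 'eat', 2),
    gevent.spawn(task, 'study', 3),
])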
Homework (send to 616971722@qq.com): implement a greenlet coroutine crawler modeled on the existing code, i.e. the plain sequential crawler listed under section 3, item 1) below.
3. greenlet coroutine crawler
1) The plain, sequential way: urllib.parse plus a blocking socket
Example: total time about 1.26 s (each request blocks until the previous one has finished)
import time, os, socket
from urllib.parse import urlparse

urls = [
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f0932b2-580x413.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1da6e5-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e7e54-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e5b2d-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f223258-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f227243-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24b9c2-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24c126-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f256263-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f297a87-580x825.jpg!page-800',
]

# The crawler class
class Crawler:
    # Constructor: url is the input, receive_data collects the response bytes
    def __init__(self, url):
        self.url = url
        self.receive_data = b''

    # Fetch a single image
    def fetch(self):
        # Split the URL into host and path
        url = urlparse(self.url)
        # Create the socket
        self.socket = socket.socket()
        # Blocking connect; plain HTTP on the default web port 80
        self.socket.connect((url.netloc, 80))
        print('connected')
        os.system('mkdir pic')   # make sure the output directory exists
        data = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(url.path, url.netloc)
        # Send the request to the server
        self.socket.send(data.encode())
        while True:
            d = self.socket.recv(1024)
            if d:
                # accumulate the raw response bytes
                self.receive_data += d
            else:
                break
        print('data received')
        filename = self.url.split('/')[-1][:-9]   # file name without the '!page-800' suffix
        with open('pic/{}'.format(filename), 'wb') as f:
            # drop the HTTP headers, keep only the body
            f.write(self.receive_data.split(b'\r\n\r\n', 1)[1])
        print('file saved')
        self.socket.close()

def main():
    start = time.time()
    for url in urls:
        crawler = Crawler(url)
        crawler.fetch()
    print('total time: {:.2f}s'.format(time.time() - start))

if __name__ == '__main__':
    main()
2) Using greenlet: total time 0.620 s
# Imports
import socket, os, time
from greenlet import greenlet, getcurrent          # coroutines
from urllib.parse import urlparse
from selectors import DefaultSelector, EVENT_WRITE, EVENT_READ   # I/O multiplexing

selector = DefaultSelector()
stopped = False

urls = [
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f0932b2-580x413.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1da6e5-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e7e54-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e5b2d-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f223258-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f227243-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24b9c2-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24c126-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f256263-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f297a87-580x825.jpg!page-800',
]

# Coroutine scheduling helper
class Hub:
    def wait(self):
        # Create a Waiter and register its switch method as the I/O callback,
        # then suspend the current coroutine until the callback delivers a result
        waiter = Waiter()
        self.callback = waiter.switch
        return waiter.get()

    def set_result(self, data):
        self.callback(data)

class Waiter:
    def __init__(self):
        # Reference to the main (parent) greenlet
        self.main_gr = main_gr

    def switch(self, value):
        # Resume the waiting coroutine with the value produced by the I/O callback
        self.gr.switch(value)

    def get(self):
        # Remember the current coroutine (the greenlet running Crawler.fetch),
        # then hand control back to the main greenlet
        self.gr = getcurrent()
        return self.main_gr.switch()

# The crawler class
class Crawler:
    def __init__(self, url):
        self._url = url
        self.url = urlparse(url)
        self.response = b''

    def fetch(self):
        global stopped
        # Create a non-blocking socket
        sock = socket.socket()
        sock.setblocking(False)
        try:
            sock.connect((self.url.netloc, 80))
        except BlockingIOError:
            pass
        # Scheduling helper for this coroutine
        h = Hub()

        # Write callback: runs once the socket is writable, i.e. the connect has finished
        def writable():
            h.set_result(None)

        # Register the write event with the selector and yield until it fires
        selector.register(sock.fileno(), EVENT_WRITE, writable)
        h.wait()
        # Back after the switch; unregister the write event
        selector.unregister(sock.fileno())

        data = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(self.url.path, self.url.netloc)
        # Send the request to the server
        sock.send(data.encode())

        # Read callback: delivers whatever bytes are available on the socket
        def readable():
            h.set_result(sock.recv(4096))

        # Register the read event
        selector.register(sock.fileno(), EVENT_READ, readable)
        while True:
            data = h.wait()
            if data:
                self.response += data
            else:
                # All data received: unregister this socket
                selector.unregister(sock.fileno())
                # Remove the finished URL from the list
                urls.remove(self._url)
                if not urls:
                    stopped = True
                # Save the image to disk, dropping the HTTP headers
                filename = self.url.path.split('/')[-1][:-9]
                with open('pic/' + filename, 'wb') as f:
                    f.write(self.response.split(b'\r\n\r\n', 1)[1])
                print('URL:{} downloaded'.format(self.url.path))
                break

def crawler():
    # Create one Crawler per URL and start its fetch coroutine
    for url in urls:
        c = Crawler(url)
        gr = greenlet(c.fetch)
        gr.switch()

# The greenlet running crawler() is the main coroutine; the fetch coroutines are created inside it
main_gr = greenlet(crawler)

# Event loop
def loop():
    while not stopped:
        events = selector.select()
        for event_key, _ in events:
            callback = event_key.data
            callback()

def main():
    start = time.time()
    os.system('mkdir pic')
    # Switch into the main coroutine to start all fetches
    main_gr.switch()
    # Run the event loop
    loop()
    print('total time: {:.3f}s'.format(time.time() - start))

if __name__ == '__main__':
    main()