'''Exercise 2: inspect the User-Agent header to decide whether a request looks like a normal browser visit.'''
from urllib import request

base_url = "http://www.langlang2017.com"

headers = {
    # Fixed header names: "Connection" (was misspelled) and "User-Agent"
    # (HTTP header names use hyphens, not underscores).
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}

req = request.Request(base_url, headers=headers)
# urllib stores header names via str.capitalize(), so the stored key is "User-agent".
user_agent = req.get_header("User-agent")
print(req.headers, user_agent)

# if user_agent:
#     print("Looks like a browser request")
# else:
#     print("Not a browser request!")

# response = request.urlopen(req)
# html = response.read()
# html = html.decode("utf-8")
# print(html)

'''Exercise 1: pretend to be a browser, and pretend to be several different browsers.
Read user-agent strings from user_agent_list.txt, wrap each one into a Request
object with headers, and crawl the page.
'''

# 1. Read the file contents
# text = ""
# with open("user_agent_list.txt", "r", encoding="utf-8") as f:
#     text = f.read()
# print(text)  # the with-block closes the file; an extra f.close() is unnecessary

# 2. Use the file contents
# import random
# user_agent_list = text.split("\n")
# print(len(user_agent_list))
# headers = {
#     "User-Agent": random.choice(user_agent_list)
# }
#
# req = request.Request(base_url, headers=headers)
# response = request.urlopen(req)
# html = response.read()
# html = html.decode("utf-8")
# print(html, req.get_header("User-agent"))

# Summary: the problem with the approach above is that reading every user agent
# at once wastes memory, and most of the strings are never used.
# Solution: iterate over the file line by line so only one line is in memory at a time.

# for line in open("user_agent_list.txt", encoding="utf-8"):
#     headers = {
#         "User-Agent": line.strip()
#     }
#     req = request.Request(base_url, headers=headers)
#     response = request.urlopen(req)
#     html = response.read()
#     html = html.decode("utf-8")
#     print(html)
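
# --- Follow-up sketch (not part of the original exercises) ---
# The summary above points at a tension: random.choice needs the whole list in
# memory, while line-by-line iteration saves memory but is not random. Reservoir
# sampling gives both: scan the file once, keep a single candidate, and replace
# it with line i at probability 1/i. This is an illustrative sketch under those
# assumptions; the helper names random_user_agent and fetch, the timeout, and
# the error handling are my own additions, not the course's solution.
import random
from urllib import error, request


def random_user_agent(path="user_agent_list.txt"):
    """Pick one line from the file uniformly at random, holding only one line in memory."""
    choice = None
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            # Keep line i with probability 1/i; after the full scan every
            # line has been selected with equal probability 1/N.
            if random.randrange(i) == 0:
                choice = line.strip()
    return choice


def fetch(url, user_agent):
    """Fetch url with the given User-Agent; return the decoded body, or None on failure."""
    req = request.Request(url, headers={"User-Agent": user_agent})
    try:
        with request.urlopen(req, timeout=10) as response:
            return response.read().decode("utf-8")
    except error.URLError as exc:  # HTTPError is a subclass, so this covers both
        print("request failed:", exc)
        return None


# Example usage: one request with a randomly sampled user agent.
# html = fetch(base_url, random_user_agent())
# print(html)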
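
# --- Side note: why get_header("User_agent") returned None (my own demonstration) ---
# Request.add_header stores keys via str.capitalize(), so "User-Agent" becomes
# "User-agent" internally. The original lookup key "User_agent" matched neither
# the underscore nor the capitalization, so it silently returned None.
demo = request.Request(base_url, headers={"User-Agent": "test-agent"})
print(demo.headers)                    # {'User-agent': 'test-agent'}
print(demo.get_header("User-agent"))   # 'test-agent'
print(demo.get_header("User_agent"))   # None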