摘要:本篇文章探讨了大数据采集之python的docker爬虫技术- 抖音视频抓取(下)(24),希望大家阅读本篇文章以后有所收获,对相关内容的理解更加深入。
本篇文章探讨了大数据采集之python的docker爬虫技术- 抖音视频抓取(下)(24),希望大家阅读本篇文章以后有所收获,对相关内容的理解更加深入。
"
其实就是把分享页面中的函数复制出来,然后通过调用该函数来完成_signature的生成。
html_head.txt
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8"> <title>Title</title> </head> <body> </body> </html> <script type="text/javascript">
![](https://upload-images.jianshu.io/upload_images/11223715-9daf18c27fb8e850.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) > html_foot.txt ``` python !function(t) { if (t.__M = t.__M || {}, !t.__M.require) { var e, n, r = document.getElementsByTagName(""head"")[0], i = {}, o = {}, a = {}, u = {}, c = {}, s = {}, l = function(t, n) { if (!(t in u)) { u[t] = !0; var i = document.createElement(""script""); if (n) { var o = setTimeout(n, e.timeout); i.onerror = function() { clearTimeout(o), n() } ; var a = function() { clearTimeout(o) }; ""onload""in i ? i.onload = a : i.onreadystatechange = function() { (""loaded"" === this.readyState || ""complete"" === this.readyState) && a() } } return i.type = ""text/javascript"", i.src = t, r.appendChild(i), i } }, f = function(t, e, n) { var r = i[t] || (i[t] = []); r.push(e); var o, a = c[t] || c[t + "".js""] || {}, u = a.pkg; o = u ? s[u].url || s[u].uri : a.url || a.uri || t, l(o, n && function() { n(t) } ) }; n = function(t, e) { ""function"" != typeof e && (e = arguments[2]), t = t.replace(/\.js$/i, """"), o[t] = e; var n = i[t]; if (n) { for (var r = 0, a = n.length; a > r; r++) n[r](); delete i[t] } } , e = function(t) { if (t && t.splice) return e.async.apply(this, arguments); t = e.alias(t); var n = a[t]; if (n) return n.exports; var r = o[t]; if (!r) throw ""[ModJS] Cannot find module `"" + t + ""`""; n = a[t] = { exports: {} }; var i = ""function"" == typeof r ? r.apply(n, [e, n.exports, n]) : r; return i && (n.exports = i), n.exports && !n.exports[""default""] && Object.defineProperty && Object.isExtensible(n.exports) && Object.defineProperty(n.exports, ""default"", { value: n.exports }), n.exports } , e.async = function(n, r, i) { function a(t) { for (var n, r = 0, h = t.length; h > r; r++) { var p = e.alias(t[r]); p in o ? 
(n = c[p] || c[p + "".js""], n && ""deps""in n && a(n.deps)) : p in s || (s[p] = !0, l++, f(p, u, i), n = c[p] || c[p + "".js""], n && ""deps""in n && a(n.deps)) } } function u() { if (0 === l--) { for (var i = [], o = 0, a = n.length; a > o; o++) i[o] = e(n[o]); r && r.apply(t, i) } } ""string"" == typeof n && (n = [n]); var s = {} , l = 0; a(n), u() } , e.resourceMap = function(t) { var e, n; n = t.res; for (e in n) n.hasOwnProperty(e) && (c[e] = n[e]); n = t.pkg; for (e in n) n.hasOwnProperty(e) && (s[e] = n[e]) } , e.loadJs = function(t) { l(t) } , e.loadCss = function(t) { if (t.content) { var e = document.createElement(""style""); e.type = ""text/css"", e.styleSheet ? e.styleSheet.cssText = t.content : e.innerHTML = t.content, r.appendChild(e) } else if (t.url) { var n = document.createElement(""link""); n.href = t.url, n.rel = ""stylesheet"", n.type = ""text/css"", r.appendChild(n) } } , e.alias = function(t) { return t.replace(/\.js$/i, """") } , e.timeout = 5e3, t.__M.define = n, t.__M.require = e } }(this) __M.define(""douyin_falcon:node_modules/byted-acrawler/dist/runtime"", function(l, e) { Function(function(l) { return 'e(e,a,r){(b[e]||(b[e]=t(""x,y"",""x ""+e+"" y"")(r,a)}a(e,a,r){(k[r]||(k[r]=t(""x,y"",""new x[y](""+Array(r+1).join("",x[y]"")(1)+"")"")(e,a)}r(e,a,r){n,t,s={},b=s.d=r?r.d+1:0;for(s[""$""+b]=s,t=0;t<b;t)s[n=""$""+t]=r[n];for(t=0,b=s=a;t<b;t)s[t]=a[t];c(e,0,s)}c(t,b,k){u(e){v[x]=e}f{g=,ting(bg)}l{try{y=c(t,b,k)}catch(e){h=e,y=l}}for(h,y,d,g,v=[],x=0;;)switch(g=){case 1:u(!)4:f5:u((e){a=0,r=e;{c=a<r;c&&u(e[a]),c}}(6:y=,u((y8:if(g=,lg,g=,y===c)b+=g;else if(y!==l)y9:c10:u(s(11:y=,u(+y)12:for(y=f,d=[],g=0;g>>065:h=,y=,[y]=h66:u(e(t[b],,67:y=,d=,u((g=).x===c?r(g.y,y,k):g.apply(d,y68:u(e((g=t[b])<""<""?(b--,f):g+g,,70:u(!1)71:n72:+f73:u(parseInt(f,3675:if(){bcase 74:g=<>16g76:u(k[])77:y=,u([y])78:g=,u(a(v,x-=g+1,g79:g=,u(k[""$""+g])81:h=,[f]=h82:u([f])83:h=,k[]=h84:!085:void 
086:u(v[x-1])88:h=,y=,h,y89:u({e{r(e.y,arguments,k)}e.y=f,e.x=c,e})90:null91:h93:h=0:;default:u((g<>16)-16)}}n=this,t=n.Function,s=Object.keys||(e){a={},r=0;for(c in e)a[r]=c;a=r,a},b={},k={};r'.replace(/[-]/g, function(e) { return l[15 & e.charCodeAt(0)] }) }(""v[x++]=v[--x]t.charCodeAt(b++)-32function return ))++.substrvar .length(),b+=;break;case ;break}"".split("""")))()('gr$Daten Иb/s!l y͒yĹg,(lfi~ah`{mv,-n|jqewVxp{rvmmx,&effkx[!cs""l"".Pq%widthl""@q&heightl""vr*getContextx$""2d[!cs#l#,*;?|u.|uc{uq$fontl#vr(fillTextx$$龘ฑภ경2<[#c}l#2q*shadowBlurl#1q-shadowOffsetXl#$$limeq+shadowColorl#vr#arcx88802[%c}l#vr&strokex[ c}l""v,)}eOmyoZB]mx[ cs!0s$l$Pb>>s!0s%yA0s""l""l!r&lengthb<k+l""^l""1+s""jl s&l&z0l!$ +[""cs\'(0l#i\'1ps9wxb&s() &{s)/s(gr&Stringr,fromCharCodes)0s*yWl ._b&s o!])l l Jb<k$.aj;l .Tb<k$.gj/l .^b&l!l Bd>&+l!l &+l!l 6d>&+l!l &+ s,y=o!o!]/q""13o!l q""10o!],l 2d>& s.{s-yMo!o!]0q""13o!]*Ld>>b|s!o!l q""10o!],l!& s/yIo!o!].q""13o!],o!]*Jd>>b|&o!]+l &+ s0l-l!&l-l!i\'1z141z4b/@d<l""b|&+l-l(l!b^&+l-l&zl\'g,)gk}ejo{cm,)|yn~Lij~em[""cl$b%@d<l&zl\'l $ +[""cl$b%b|&+l-l%8d<@b|l!b^&+ q$sign ', [Object.defineProperty(e, ""__esModule"", { value: !0 })]) }); dycs = __M.require(""douyin_falcon:node_modules/byted-acrawler/dist/runtime"") signc = dycs.sign(&&&&)document.title = signcdocument.write(signc)
handle_douyin_movie.py 下载代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/2/20 17:39
# @Author : Aries
# @Site :
# @File : handle_douyin_movie.py.py
# @Software: PyCharm
"""Download the videos listed on one Douyin user's share page.

Steps performed by ``main()``:

1. Fetch the share page and extract the ``dytk`` token and the ``tac`` script.
2. Write ``test.html`` as html_head.txt + tac + html_foot.txt; the foot script
   computes the signature and stores it in ``document.title``.
3. Open ``test.html`` in headless Chrome and read the signature from the title.
4. Poll the post-list endpoint until it returns videos, then download them.
"""
import json
import os
import re
import time
from pathlib import Path

# Share ID of the user whose videos are downloaded.
share_id = "89923219116"
share_url = "https://www.douyin.com/share/user/" + share_id
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"
    )
}

# Regular expressions for dytk and tac.
dytk_search = re.compile(r"dytk: '(.*?)'")
# NOTE(review): the published pattern was empty (its HTML-like text appears to
# have been stripped), so group(1) could never succeed. This pattern is a
# reconstruction that assumes the page embeds "<script>tac=...</script>" with
# the value already quoted -- TODO confirm against the live share page.
tac_search = re.compile(r"<script>tac=(.*?)</script>")

CHROMEDRIVER_PATH = r"D:\Program Files\chromedriver\chromedriver.exe"
SIGNATURE_PAGE = "test.html"
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging forever
RETRY_DELAY_SECONDS = 1  # pause between polls instead of hammering the endpoint
MAX_ATTEMPTS = 60


def http_get(url):
    """Send a GET request to *url* with the shared headers and a timeout."""
    # Imported lazily so the pure helpers below stay importable on machines
    # that do not have requests installed.
    import requests

    return requests.get(url=url, headers=header, timeout=REQUEST_TIMEOUT)


def extract_tokens(page_text):
    """Return ``(dytk, tac_js)`` extracted from the share page HTML.

    ``tac_js`` is the tac value wrapped as a JavaScript statement
    (``var tac=...;``) ready to be placed inside the signature page.

    Raises:
        ValueError: if either token cannot be found in *page_text*.
    """
    dytk_match = dytk_search.search(page_text)
    tac_match = tac_search.search(page_text)
    if dytk_match is None or tac_match is None:
        raise ValueError("dytk or tac was not found in the share page")
    return dytk_match.group(1), "var tac=" + tac_match.group(1) + ";"


def write_signature_page(tac_js, page_path=SIGNATURE_PAGE):
    """Assemble the signature page: html_head.txt + tac + html_foot.txt."""
    # The page head declares charset UTF-8, so every file uses that encoding.
    with open("html_head.txt", encoding="utf-8") as head_file:
        head = head_file.read()
    with open("html_foot.txt", encoding="utf-8") as foot_file:
        # "&&&&" is the placeholder for the user id passed to the sign call.
        foot = foot_file.read().replace("&&&&", share_id)
    with open(page_path, "w", encoding="utf-8") as page_file:
        page_file.write(head + "\n" + tac_js + "\n" + foot)


def compute_signature(page_path=SIGNATURE_PAGE):
    """Load the signature page in headless Chrome and return its title.

    The foot script assigns the computed signature to ``document.title``.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    abspath = os.path.abspath(CHROMEDRIVER_PATH)
    # NOTE(review): executable_path/chrome_options are the Selenium 3 keyword
    # arguments this script was written for; newer Selenium releases expect
    # Service/options instead -- confirm the installed version before changing.
    douyin_driver = webdriver.Chrome(
        executable_path=abspath,
        chrome_options=chrome_options,
    )
    try:
        # Open the file that was just written rather than a fixed drive path.
        douyin_driver.get(Path(page_path).resolve().as_uri())
        return douyin_driver.title
    finally:
        # Always release the browser, even if loading the page fails.
        douyin_driver.quit()


def build_movie_url(signature, dytk):
    """Return the post-list URL for ``share_id`` with the given tokens."""
    return (
        "https://www.douyin.com/aweme/v1/aweme/post/?user_id=" + share_id
        + "&count=21&max_cursor=0&aid=1128&_signature=" + signature
        + "&dytk=" + dytk
    )


def fetch_aweme_list(movie_url, max_attempts=MAX_ATTEMPTS,
                     delay=RETRY_DELAY_SECONDS):
    """Poll *movie_url* until it returns a non-empty ``aweme_list``.

    The endpoint is not very stable and may answer with an empty list, so the
    request is repeated with a short pause between attempts.

    Raises:
        RuntimeError: if no videos are returned after *max_attempts* tries.
    """
    for _ in range(max_attempts):
        movie_response = http_get(movie_url)
        aweme_list = json.loads(movie_response.text).get("aweme_list")
        if aweme_list:
            print(movie_response.text)
            return aweme_list
        time.sleep(delay)
    raise RuntimeError(
        "no videos returned after {} attempts".format(max_attempts)
    )


def download_videos(aweme_list):
    """Download every video in *aweme_list* to ``douyin_<n>.mp4``."""
    for index, item in enumerate(aweme_list, start=1):
        video_url = item["video"]["play_addr"]["url_list"][0]
        video_response = http_get(video_url)
        # Numbered names keep later videos from overwriting earlier ones.
        with open("douyin_{}.mp4".format(index), "wb") as video_file:
            # The payload is binary: write .content, not .text.
            video_file.write(video_response.content)


def main():
    """Run the whole flow: tokens, signature, video list, downloads."""
    response = http_get(share_url)
    dytk, tac_js = extract_tokens(response.text)
    write_signature_page(tac_js)
    signature = compute_signature()
    download_videos(fetch_aweme_list(build_movie_url(signature, dytk)))


if __name__ == "__main__":
    main()
最终结果
关于chromedriver的配置,直接在代码里引入它的路径最稳妥,我比较喜欢这种方式;网上很多教程是通过配置环境变量来实现的,容易导致电脑变慢,不建议这样做。
PS:抖音视频下载的功能基本上已经完成了,下次对需要注意的地方做个总结。
" 本文由职坐标整理发布,学习更多的相关知识,请关注职坐标IT知识库!
您输入的评论内容中包含违禁敏感词
我知道了
请输入正确的手机号码
请输入正确的验证码
您今天的短信下发次数太多了,明天再试试吧!
我们会在第一时间安排职业规划师联系您!
您也可以联系我们的职业规划师咨询:
版权所有 职坐标-一站式IT培训就业服务领导者 沪ICP备13042190号-4
上海海同信息科技有限公司 Copyright ©2015 www.zhizuobiao.com,All Rights Reserved.
沪公网安备 31011502005948号