生活资讯
如何抓取微博关键词数据库 微博扒取网络数据
2023-04-15 21:00  浏览:50

如何抓取微博关键词数据库、扒取微博网络数据?本文以一段 Java 示例代码为例(入口为 String strURL="https://weibo.com/u/3502967407";,也可以换成 https://weibo.com/u/3081728031 或用户 ID 5723344072),接下来我们就来聊聊如何抓取微博关键词数据库、扒取微博网络数据。以下内容大家不妨参考一二,希望能帮到您!

如何抓取微博关键词数据库 微博扒取网络数据 如何抓取微博关键词数据库 微博扒取网络数据

String strURL="https://weibo.com/u/3502967407";//https://weibo.com/u/3081728031 5723344072

URL url=null;

HttpURLConnection httpConn=null;

url = new URL(strURL);

httpConn = (HttpURLConnection) url.openConnection();

//String c="SUB=_2AkMqj-zif8NxqwJRmfkcyG7la4R0ygjEieKc0x05JRMxHRl-yT9jqhUitRB6AQ_CDRrmGwjoWaf2alXg9Yfxki-R4Nwe; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5gfVwXwLLzATj6ArcV1q7i; SINAGLOBAL=2525797642447.1143.1576751690811; _s_tentry=localhost:8080; Apache=9113724801556.377.1583116766626; ULV=1583116766636:2:1:1:9113724801556.377.1583116766626:1582854844672; TC-V5-G0=4de7df00d4dc12eb0897c97413797808; login_sid_t=96a715575970779900d6d744eadd4ef1; cross_origin_proto=SSL; UOR=,,localhost:8080; Ugrow-G0=140ad66ad7317901fc818d7fd7743564; wb_view_log=1920*10801; TC-Page-G0=1ae767ccb34a580ffdaaa3a58eb208b8|1584343362|1584343362";

//String c="SINAGLOBAL=2525797642447.1143.1576751690811; _s_tentry=localhost:8080; Apache=9113724801556.377.1583116766626; ULV=1583116766636:2:1:1:9113724801556.377.1583116766626:1582854844672; TC-V5-G0=4de7df00d4dc12eb0897c97413797808; login_sid_t=96a715575970779900d6d744eadd4ef1; cross_origin_proto=SSL; Ugrow-G0=140ad66ad7317901fc818d7fd7743564; WBtopGlobal_register_version=3d5b6de7399dfbdb; wb_view_log_6439293145=1920*10801; wb_view_log=1920*10801; UOR=,,www.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWlIp9eUCCs0AXkbTy9zp7x5JpX5K2hUgL.Foqpeo-NeKqNS0.2dJLoIEXLxKqLBonL1h-LxKMLB.2LB-qLxKML1-2L1hBLxKnLBKqL1h2LxKqLB-BLB.zt; ALF=1615971499; SSOLoginState=1584435500; SCF=AjcCfB6DUrrZ2fMhnntI_TyQc2JsccpWc3X4bHbuPEpJHcrUMAiEDq2Fby6kEoayWwopa6y9lMEbqh1h7NHOffM.; SUB=_2A25zdOF8DeRhGeBP6VcW8SjLzDWIHXVQAFW0rDV8PUNbmtANLUr-kW9NRWCbkhivb5UzMh1zGT7KgW6D-dSnnHFj; SUHB=0sqBpKsKWKrq8Z; un=18595757685; wvr=6; wb_view_log_6125716779=1920*10801; TC-Page-G0=1ae767ccb34a580ffdaaa3a58eb208b8|1584440113|1584440107; webim_unReadCount={"time":1584440351054,"dm_pub_total":0,"chat_group_client":0,"allcountNum":3,"msgbox":0}";

//String c="SINAGLOBAL=2525797642447.1143.1576751690811; _s_tentry=localhost:8080; Apache=9113724801556.377.1583116766626; ULV=1583116766636:2:1:1:9113724801556.377.1583116766626:1582854844672; TC-V5-G0=4de7df00d4dc12eb0897c97413797808; login_sid_t=96a715575970779900d6d744eadd4ef1; cross_origin_proto=SSL; Ugrow-G0=140ad66ad7317901fc818d7fd7743564; WBtopGlobal_register_version=3d5b6de7399dfbdb; wb_view_log_6439293145=1920*10801; wb_view_log=1920*10801; UOR=,,www.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWlIp9eUCCs0AXkbTy9zp7x5JpX5K2hUgL.Foqpeo-NeKqNS0.2dJLoIEXLxKqLBonL1h-LxKMLB.2LB-qLxKML1-2L1hBLxKnLBKqL1h2LxKqLB-BLB.zt; ALF=1615971499; SSOLoginState=1584435500; SCF=AjcCfB6DUrrZ2fMhnntI_TyQc2JsccpWc3X4bHbuPEpJHcrUMAiEDq2Fby6kEoayWwopa6y9lMEbqh1h7NHOffM.; SUB=_2A25zdOF8DeRhGeBP6VcW8SjLzDWIHXVQAFW0rDV8PUNbmtANLUr-kW9NRWCbkhivb5UzMh1zGT7KgW6D-dSnnHFj; SUHB=0sqBpKsKWKrq8Z; un=18595757685; wvr=6; wb_view_log_6125716779=1920*10801; webim_unReadCount={"time":1584440603192,"dm_pub_total":0,"chat_group_client":0,"allcountNum":3,"msgbox":0}; TC-Page-G0=b993e9b6e353749ed3459e1837a0ae89|1584440608|1584440580";

// Cookie string that is sent as the "cookie" request header further down.
// NOTE(review): this looks like a value copied from a browser session — it is hard-coded and will
// presumably expire; replace it with your own cookie and avoid committing real session data.
String c="SINAGLOBAL=2525797642447.1143.1576751690811; UOR=,,login.sina.com.cn; TC-V5-G0=595b7637c272b28fccec3e9d529f251a; SSOLoginState=1585210218; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wvr=6; _s_tentry=weibo.com; Apache=7211436044072.67.1585211180994; ULV=1585211181930:3:2:1:7211436044072.67.1585211180994:1583116766636; SUB=_2AkMp3ULYf8PxqwJRmfkcyG7la4R0ygjEieKfgbMDJRMxHRl-yT9jqk8GtRB6Al1sKDCUM-bsv44hS2JWofGDBG0WLLhQ; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5gfVwXwLLzATj6ArcV1q7i; TC-Page-G0=62b98c0fc3e291bc0c7511933c1b13ad|1585565168|1585565167";

//String c="SINAGLOBAL=2525797642447.1143.1576751690811; UOR=,,login.sina.com.cn; TC-V5-G0=595b7637c272b28fccec3e9d529f251a; SSOLoginState=1585210218; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wvr=6; _s_tentry=weibo.com; Apache=7211436044072.67.1585211180994; ULV=1585211181930:3:2:1:7211436044072.67.1585211180994:1583116766636; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWlIp9eUCCs0AXkbTy9zp7x5JpX5KMhUgL.Foqpeo-NeKqNS0.2dJLoIEXLxKqLBonL1h-LxKMLB.2LB-qLxKML1-2L1hBLxKnLBKqL1h2LxKqLB-BLB.zt; ALF=1616895226; SCF=AjcCfB6DUrrZ2fMhnntI_TyQc2JsccpWc3X4bHbuPEpJIndU4aQ389BrJPQyB4i6Qj847pZmhvQfZIHMPwC8ARc.; SUB=_2A25zetktDeRhGeBP6VcW8SjLzDWIHXVQDk3lrDV8PUNbmtAKLWXFkW9NRWCbkkZQxvxIVxXfNU1QVQBfoeucUtmz; SUHB=0LcSPzPnzcU0HI; wb_view_log_6125716779=1920*10801; TC-Page-G0=841d8e04c4761f733a87c822f72195f3|1585363186|1585363180; webim_unReadCount={"time":1585363187658,"dm_pub_total":0,"chat_group_client":0,"allcountNum":36,"msgbox":0}";

// Key point 1: Weibo relies on cookies. Without a cookie the crawler only gets back a nearly empty
// page with no real content, which looks nothing like what the browser's F12 developer tools show.
httpConn.setRequestProperty("cookie", c);

// NOTE(review): "charset" is not a standard HTTP request header; kept because the original sends it.
httpConn.setRequestProperty("charset", "utf-8");

StringBuilder contentBuf = new StringBuilder();

// try-with-resources closes the reader (and the wrapped response stream) even when reading fails;
// the original never closed it.
try (BufferedReader bufReader =
        new BufferedReader(new InputStreamReader(httpConn.getInputStream(), "utf-8"))) {

    String line;

    // readLine() drops line terminators, so the page is concatenated into one long line.
    while ((line = bufReader.readLine()) != null) {
        contentBuf.append(line);
    }
}

// Whole response body as a single string; printed for inspection and parsed afterwards.
String buf = contentBuf.toString();

System.out.println(buf);

// Parse the downloaded page. (The type is Jsoup's Document; the lowercase "document" type name in
// the published snippet does not exist.)
Document document = Jsoup.parse(buf);

// Key point 2: the visible page content is not in the static HTML; it is rendered by <script>
// blocks, and the data each script renders is ordinary JSON. The page is rendered by many scripts,
// not one, so the crucial step is locating the specific script that carries the content you want.
Elements elements = document.select("script");

// JSON key that precedes the embedded HTML fragment inside a script: "html":"
// (the published snippet had lost the backslash escapes, which made the literal invalid Java).
final String htmlMarker = "\"html\":\"";

for (Element element : elements) {

    // Script payload, cut at an embedded "<script>FM.view" marker if one is present.
    String s1 = element.data().split("<script>FM.view")[0];

    // Only scripts that embed an HTML fragment are of interest.
    if (!s1.contains(htmlMarker)) {
        continue;
    }

    // Limit -1 keeps trailing empty pieces, so parts[1] exists whenever the marker is present
    // (the original split(...)[1] threw if the marker was the last thing in the script).
    String[] parts = s1.split(htmlMarker, -1);

    // Text before the marker identifies which page module this script renders.
    String moduleInfo = parts[0];
    boolean isHeaderModule = moduleInfo.contains("Pl_Official_Headerv6__1");
    boolean isCounterModule = moduleInfo.contains("Pl_Core_T8CustomTriColumn__3");

    if (!isHeaderModule && !isCounterModule) {
        continue;
    }

    // Peel the layers: undo the JSON string escaping of the embedded HTML. Literal replace() is
    // used instead of regex replaceAll() so the two-character sequences \t \n \r \" \/ are matched
    // exactly; the published regexes had lost a level of backslashes and no longer did this.
    String content = parts[1]
            .replace("\\t", "")
            .replace("\\n", "")
            .replace("\\r", "")
            .replace("\\\"", "\"")
            .replace("\\/", "/");

    // Drop the last 13 characters (kept from the original; presumably the JSON/call tail that
    // follows the HTML value — TODO confirm against a real response).
    content = content.substring(0, content.length() <= 13 ? content.length() : content.length() - 13);

    Document header = Jsoup.parse(content);

    if (isHeaderModule) {
        Elements headerphoto = header.getElementsByClass("photo");
        Elements username = header.getElementsByClass("username");

        String nickName = username.text();
        String img_url = headerphoto.attr("src");

        // w.setNickname(nickName);
        // w.setImg_url(img_url);

        System.out.println(nickName);
        System.out.println(img_url);
    }

    if (isCounterModule) {
        // The counters appear under one of several CSS classes; try each in turn.
        Elements data = header.getElementsByClass("W_f14");

        if (data.size() == 0) {
            data = header.getElementsByClass("W_f16");
        }

        if (data.size() == 0) {
            data = header.getElementsByClass("W_f18");
        }

        // The second matching element is used (presumably the follower count, judging by the
        // commented-out setFan_num call). Guarded so a missing element no longer throws
        // IndexOutOfBoundsException.
        if (data.size() > 1) {
            String fun = data.get(1).text();

            // w.setFan_num(fun);

            System.out.println(fun);
        }
    }
}

}

,
发表评论
0评