我的需求是获取网站源代码,然后提取内容。当请求微博网址时(https://weibo.com/7081990832/JyLmEmlSa),返回的源代码中文部分全部乱码,尝试过用乱码后HTML源代码去转utf-8无解。不知道是不是和微博默认编码GB2312和【HTTP请求】模块默认编码不一致有关。但我尝试使用Python指定编码请求是能正确获得正确的HTML源编码的,不太懂这块但感觉应该是有联系的。
如果是我所想的这样的话,应该能在【HTTP请求】模块功能上增加能够判断HTML源码编码的条件,
请求——判断HTML源码编码——指定编码返回响应
这样一来能避免除微博以外其他网站的问题,不知道技术上能不能实现。也许大佬有更好的解决办法。
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=gb2312"/>
<title>Sina Visitor System</title>
</head>
<body>
<span id="message"></span>
<script type="text/javascript" src="/js/visitor/mini_original.js?v=20161116"></script>
<script type="text/javascript">
window.use_fp = "1" == "1"; // �Ƿ�ɼ��豸ָ�ơ�
var url = url || {};
(function () {
this.l = function (u, c) {
try {
var s = document.createElement("script");
s.type = "text/javascript";
s[document.all ? "onreadystatechange" : "onload"] = function () {
if (document.all && this.readyState != "loaded" && this.readyState != "complete") {
return
}
this[document.all ? "onreadystatechange" : "onload"] = null;
this.parentNode.removeChild(this);
if (c) {
c()
}
};
s.src = u;
document.getElementsByTagName("head")[0].appendChild(s)
} catch (e) {
}
};
}).call(url);
// ������ڡ�
wload(function () {
try {
var need_restore = "1" == "1"; // �Ƿ��ָ�������̡�
// �����Ҫ�ָ�������̣����Դ� cookie ��ȡ�û���ݡ�
if (!need_restore || !Store.CookieHelper.get("SRF")) {
// ���ȡʧ���ߴ����ÿ����̡�
// ����ִ��ʱ����������� 3s��������Ϊ�����
var error_timeout = window.setTimeout("error_back()", 5000);
tid.get(function (tid, where, confidence) {
// ȡָ��˳����ɣ�������� timeout ��
window.clearTimeout(error_timeout);
incarnate(tid, where, confidence);
});
} else {
// �û���ݴ��ڣ����Իָ��û���ݡ�
restore();
}
} catch (e) {
// �����
error_back();
}
});
// �����ء� �ص�������
var return_back = function (response) {
if (response["retcode"] == 20000000) {
back();
} else {
// �����
error_back(response["msg"]);
}
};
// ��ת�س�ʼ��ַ��
var back = function() {
var url = "https://weibo.com/7081990832/JyLmEmlSa";
if (url != "none") {
window.location.href = url;
}
};
// ����㲥��
var cross_domain = function (response) {
var from = "weibo";
var entry = "miniblog";
if (response["retcode"] == 20000000) {
var crossdomain_host = "login.sina.com.cn";
if (crossdomain_host != "none") {
var cross_domain_intr = window.location.protocol + "//" + crossdomain_host + "/visitor/visitor?a=crossdomain&cb=return_back&s=" +
encodeURIComponent(response["data"]["sub"]) + "&sp=" + encodeURIComponent(response["data"]["subp"]) + "&from=" + from + "&_rand=" + Math.random() + "&entry=" + entry;
url.l(cross_domain_intr);
} else {
back();
}
} else {
// �����
error_back(response["msg"]);
}
};
// Ϊ�û�����ÿ���� ��
var incarnate = function (tid, where, conficence) {
var gen_conf = "";
var from = "weibo";
var incarnate_intr = window.location.protocol + "//" + window.location.host + "/visitor/visitor?a=incarnate&t=" +
encodeURIComponent(tid) + "&w=" + encodeURIComponent(where) + "&c=" + encodeURIComponent(conficence) +
"&gc=" + encodeURIComponent(gen_conf) + "&cb=cross_domain&from=" + from + "&_rand=" + Math.random();
url.l(incarnate_intr);
};
// �ָ��û���ʧ����ݡ�
var restore = function () {
var from = "weibo";
var restore_intr = window.location.protocol + "//" + window.location.host +
"/visitor/visitor?a=restore&cb=restore_back&from=" + from + "&_rand=" + Math.random();
url.l(restore_intr);
};
// ����ָ���ʧ����ݡ�
var restore_back = function (response) {
// ��ݻָ��ɹ��߹㲥���̣������ߴ����ÿ����̡�
if (response["retcode"] == 20000000) {
var url = "https://weibo.com/7081990832/JyLmEmlSa";
var alt = response["data"]["alt"];
var savestate = response["data"]["savestate"];
if (alt != "") {
requrl = (url == "none") ? "" : "&url=" + encodeURIComponent(url);
var params = "entry=sso&alt=" + encodeURIComponent(alt) + "&returntype=META" +
"&gateway=1&savestate=" + encodeURIComponent(savestate) + requrl;
window.location.href = "https://login.sina.com.cn/sso/login.php?" + params;
} else {
cross_domain(response);
}
} else if(response['retcode'] == 50111261 && isInIframe()) {
//do nothing
} else {
tid.get(function (tid, where, confidence) {
incarnate(tid, where, confidence);
});
}
};
// ����������ص�¼ҳ��
var error_back = function (msg) {
var url = "https://weibo.com/7081990832/JyLmEmlSa";
var clientType = "pc";
if (url != "none") {
if (url.indexOf("ssovie4c55=0") === -1) {
url += (((url.indexOf("?") === -1) ? "?" : "&") + "ssovie4c55=0");
}
if (clientType == "mobile") {
window.location.href = "https://passport.weibo.cn/signin/login?r="+url;
} else{
window.location.href = "https://weibo.com/login.php";
}
} else {
if(document.getElementById("message")) {
document.getElementById("message").innerHTML = "Error occurred" + (msg ? (": " + msg) : ".");
}
}
};
var isInIframe = function () {
try {
return window.self !== window.top;
} catch (e) {
return true;
}
};
</script>
</body>
</html>
请问怎么使用?
刚试了下,返回的源码不对。再次打开网站已经无法打开了。
Sorry for the inconvenience.
Please report this message and include the following information to us.
Thank you very much!
URL: | http://weibo.com:8150/6108276212/JyLkwe0Ip?type=comment |
Server: | mapi-bypass-yf-5799f4c84b-n7k2q |
Date: | 2020/12/18 10:54:14 |
微博啊。你试试这个
https://weibo.com/6108276212/JyLkwe0Ip
(T_T)是我疏忽了,没注意脚本返回的内容,感谢。微博、今日头条之类的不好获取源码,而且麻烦,我只能通过笨办法,用浏览器打开在获取源码。