朙朙创建于 2020-12-18 00:27

我的需求是获取网站源代码，然后提取内容。当请求微博网址时（https://weibo.com/7081990832/JyLmEmlSa），返回的源代码中文部分全部乱码，尝试过把乱码后的HTML源代码转成UTF-8，无解。不知道是不是和微博默认编码GB2312与【HTTP请求】模块默认编码不一致有关。但我尝试使用Python指定编码请求，是能获得正确的HTML源代码的。我不太懂这块，但感觉应该是有联系的。

如果是我所想的这样的话，应该能在【HTTP请求】模块功能上增加能够判断HTML源码编码的条件，

请求——判断HTML源码编码——指定编码返回响应

这样一来能避免除微博以外其他网站的问题，不知道技术上能不能实现。也许大佬有更好的解决办法。

<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="Content-type" content="text/html; charset=gb2312"/>
    <title>Sina Visitor System</title>
</head>
<body>
<span id="message"></span>
<script type="text/javascript" src="/js/visitor/mini_original.js?v=20161116"></script>
<script type="text/javascript">
    // Flag published on window. The comparison of the two string literals is
    // always true here; the literals look template-rendered (assumption — confirm
    // against the server-side template). The name suggests "use fingerprint".
    window.use_fp = "1" == "1"; // �Ƿ�ɼ��豸ָ�ơ�
    // Reuse a pre-existing global `url` object if one exists, otherwise start
    // from an empty object. Relies on `var` hoisting, so it must stay a `var`.
    var url = url || {};
    (function () {
        /**
         * Load a script by appending a <script> element to the first <head>.
         *
         * When the script has loaded, the element detaches its own handler,
         * removes itself from the DOM and then invokes the optional callback.
         * Any exception raised while building or inserting the element is
         * deliberately ignored (best-effort loader).
         *
         * @param {string} u - URL assigned to the script's `src`.
         * @param {Function} [c] - Called with no arguments after the load.
         */
        this.l = function (u, c) {
            try {
                var node = document.createElement("script");
                node.type = "text/javascript";

                // Browsers exposing document.all get the readystatechange
                // path; everything else uses onload.
                var hook = document.all ? "onreadystatechange" : "onload";

                node[hook] = function () {
                    // On the readystatechange path, wait for a terminal state.
                    if (document.all && !(this.readyState == "loaded" || this.readyState == "complete")) {
                        return;
                    }

                    // Detach the handler before removing the element.
                    this[document.all ? "onreadystatechange" : "onload"] = null;
                    this.parentNode.removeChild(this);

                    if (c) {
                        c();
                    }
                };

                node.src = u;
                document.getElementsByTagName("head")[0].appendChild(node);
            } catch (ignored) {
                // Intentionally swallowed: a failed load is not reported here.
            }
        };
    }).call(url);

    // ������ڡ�
    // Entry point of the visitor flow. `wload`, `Store` and `tid` are not
    // defined in this snippet — presumably they come from the external
    // mini_original.js script included above (confirm).
    wload(function () {

        try {

            // Comparison of two literals; always true as rendered here.
            var need_restore = "1" == "1";

            // Without the restore flag, or without an "SRF" cookie, go through
            // the fingerprint + incarnate path; otherwise try restore().
            if (!need_restore || !Store.CookieHelper.get("SRF")) {

                // Watchdog: fall back to error_back() if tid.get has not called
                // back within 5000 ms. A function is passed instead of the
                // original code string so nothing is eval'd by the timer.
                var error_timeout = window.setTimeout(function () {
                    error_back();
                }, 5000);

                tid.get(function (tid, where, confidence) {
                    // tid.get completed: cancel the watchdog first.
                    window.clearTimeout(error_timeout);
                    incarnate(tid, where, confidence);
                });
            } else {
                restore();
            }
        } catch (e) {
            // Any synchronous failure ends the flow through error_back().
            error_back();
        }
    });

    // �����ء� �ص�������
    /**
     * Callback named by the cross-domain request (`cb=return_back`).
     *
     * @param {Object} response - Response object; `retcode` 20000000 means
     *     success, anything else is routed to error_back with `msg`.
     */
    var return_back = function (response) {
        var succeeded = response["retcode"] == 20000000;

        if (!succeeded) {
            error_back(response["msg"]);
            return;
        }

        back();
    };

    // ��ת�س�ʼ��ַ��
    /**
     * Navigate the window to the hard-coded target address, unless that
     * address is the sentinel string "none".
     */
    var back = function () {
        var target = "https://weibo.com/7081990832/JyLmEmlSa";

        if (target == "none") {
            return;
        }

        window.location.href = target;
    };

    // ����㲥��
    /**
     * Callback named by the incarnate request (`cb=cross_domain`).
     *
     * On success (`retcode` 20000000) it loads the crossdomain endpoint on
     * the configured host through url.l, passing `data.sub` and `data.subp`
     * and naming `return_back` as the next callback. If the host is the
     * sentinel "none" it calls back() directly. On failure it calls
     * error_back with `msg`.
     *
     * @param {Object} response - Response object with `retcode`, and on
     *     success `data.sub` / `data.subp`.
     */
    var cross_domain = function (response) {
        if (response["retcode"] != 20000000) {
            error_back(response["msg"]);
            return;
        }

        var crossdomain_host = "login.sina.com.cn";
        if (crossdomain_host == "none") {
            back();
            return;
        }

        var from = "weibo";
        var entry = "miniblog";

        // Assemble the request piece by piece, in the original parameter order.
        var target = window.location.protocol + "//" + crossdomain_host;
        target += "/visitor/visitor?a=crossdomain&cb=return_back&s=";
        target += encodeURIComponent(response["data"]["sub"]);
        target += "&sp=" + encodeURIComponent(response["data"]["subp"]);
        target += "&from=" + from;
        target += "&_rand=" + Math.random();
        target += "&entry=" + entry;

        url.l(target);
    };

    // Ϊ�û�����ÿ���� ��
    /**
     * Request the `incarnate` action on the current host through url.l,
     * naming `cross_domain` as the callback.
     *
     * @param {*} tid - Sent URL-encoded as the `t` parameter.
     * @param {*} where - Sent URL-encoded as the `w` parameter.
     * @param {*} conficence - Sent URL-encoded as the `c` parameter.
     */
    var incarnate = function (tid, where, conficence) {
        var from = "weibo";
        var gen_conf = "";

        var target = window.location.protocol + "//" + window.location.host;
        target += "/visitor/visitor?a=incarnate&t=" + encodeURIComponent(tid);
        target += "&w=" + encodeURIComponent(where);
        target += "&c=" + encodeURIComponent(conficence);
        target += "&gc=" + encodeURIComponent(gen_conf);
        target += "&cb=cross_domain&from=" + from;
        target += "&_rand=" + Math.random();

        url.l(target);
    };

    // �ָ��û���ʧ����ݡ�
    /**
     * Request the `restore` action on the current host through url.l,
     * naming `restore_back` as the callback.
     */
    var restore = function () {
        var from = "weibo";

        var target = window.location.protocol + "//" + window.location.host;
        target += "/visitor/visitor?a=restore&cb=restore_back&from=" + from;
        target += "&_rand=" + Math.random();

        url.l(target);
    };

    // ����ָ���ʧ����ݡ�
    /**
     * Callback named by the restore request (`cb=restore_back`).
     *
     * - `retcode` 20000000 with a non-empty `data.alt`: redirect to the SSO
     *   login endpoint, passing `alt`, `savestate` and the return URL.
     * - `retcode` 20000000 with an empty `data.alt`: hand the response to
     *   cross_domain().
     * - `retcode` 50111261 while inside an iframe: do nothing.
     * - anything else: run tid.get and incarnate with its results.
     *
     * @param {Object} response - Response object with `retcode`, and on
     *     success `data.alt` / `data.savestate`.
     */
    var restore_back = function (response) {

        if (response["retcode"] == 20000000) {

            var url = "https://weibo.com/7081990832/JyLmEmlSa";
            var alt = response["data"]["alt"];
            var savestate = response["data"]["savestate"];
            if (alt != "") {
                // Declared locally: the original assigned `requrl` without a
                // declaration, leaking a global and throwing in strict mode.
                var requrl = (url == "none") ? "" : "&url=" + encodeURIComponent(url);
                var params = "entry=sso&alt=" + encodeURIComponent(alt) + "&returntype=META" +
                    "&gateway=1&savestate=" + encodeURIComponent(savestate) + requrl;
                window.location.href = "https://login.sina.com.cn/sso/login.php?" + params;
            } else {

                cross_domain(response);
            }
        } else if (response["retcode"] == 50111261 && isInIframe()) {
            // Intentionally nothing to do for this return code inside an iframe.
        } else {

            tid.get(function (tid, where, confidence) {
                incarnate(tid, where, confidence);
            });
        }
    };

    // ����������ص�¼ҳ��
    /**
     * Abort the flow.
     *
     * When a return URL is configured (anything but the sentinel "none"),
     * the marker parameter `ssovie4c55=0` is appended to it if missing and
     * the window is redirected to a login page: the mobile passport page
     * (carrying the return URL in `r`) for client type "mobile", otherwise
     * the desktop login page. Without a return URL, an error message is
     * shown in the element with id "message", if it exists.
     *
     * @param {string} [msg] - Optional detail appended to the error message.
     */
    var error_back = function (msg) {

        var url = "https://weibo.com/7081990832/JyLmEmlSa";
        var clientType = "pc";
        if (url != "none") {

            if (url.indexOf("ssovie4c55=0") === -1) {
                url += (((url.indexOf("?") === -1) ? "?" : "&") + "ssovie4c55=0");
            }
            if (clientType == "mobile") {
                // Encode the return URL like every other query value in this
                // script; it contains "?" (and possibly "&") of its own.
                window.location.href = "https://passport.weibo.cn/signin/login?r=" + encodeURIComponent(url);
            } else {
                window.location.href = "https://weibo.com/login.php";
            }
        } else {

            var messageNode = document.getElementById("message");
            if (messageNode) {
                // `msg` comes from a remote response, so insert it as a text
                // node instead of parsing it as HTML.
                while (messageNode.firstChild) {
                    messageNode.removeChild(messageNode.firstChild);
                }
                messageNode.appendChild(
                    document.createTextNode("Error occurred" + (msg ? (": " + msg) : "."))
                );
            }
        }
    };

    /**
     * Report whether this window is framed.
     *
     * @returns {boolean} true when `window.self` differs from `window.top`,
     *     and also when reading those properties throws.
     */
    var isInIframe = function () {
        var framed;

        try {
            framed = window.self !== window.top;
        } catch (accessError) {
            // Treat a failed property access as "framed".
            framed = true;
        }

        return framed;
    };

</script>
</body>
</html>

添加评论

回复内容

CL 2020-12-18 08:10

没有找到很好的办法解决这个问题😒

zetalpha 2020-12-18 10:40

可以用 C# 子程序，我写了个。目前模块里面无法改编码。

朙朙 2020-12-18 10:43 :

请问怎么使用？

zetalpha 回复朙朙 2020-12-18 10:45 :

输入链接输出文本就是源码

zetalpha 回复朙朙 2020-12-18 10:46 :

https://getquicker.net/subprogram?id=961ab72b-afb5-4fd3-023f-08d86f4ce16b

朙朙 2020-12-18 10:55 :

刚试了下，返回的源码不对。再次打开网站已经无法打开了。

414 Request-URI Too Large

Sorry for the inconvenience.
Please report this message and include the following information to us.
Thank you very much!

URL:	http://weibo.com:8150/6108276212/JyLkwe0Ip?type=comment
Server:	mapi-bypass-yf-5799f4c84b-n7k2q
Date:	2020/12/18 10:54:14

zetalpha 回复朙朙 2020-12-18 11:17 :

你这个是啥地址怎么还带端口号？

朙朙 2020-12-18 11:19 :

微博啊。你试试这个
https://weibo.com/6108276212/JyLkwe0Ip

zetalpha 回复朙朙 2020-12-18 20:26 :

获取不了。新浪微博这类网站会验证账号，你发出来的脚本，检查内容后发现并没有网页源码。

朙朙 2020-12-18 21:07 :

（T_T）是我疏忽了，没注意脚本返回的内容，感谢。微博、今日头条之类的不好获取源码，而且麻烦，我只能通过笨办法，用浏览器打开再获取源码。

zetalpha 2020-12-18 20:29

关于【HTTP请求】模块返回的文本结果乱码问题

回复内容

414 Request-URI Too Large

回复主贴