pda/zhuike/.svn/pristine/88/883e8f7c9b3547f91491cb27cad...

package com.novelbook.android.utils;


import android.text.TextUtils;
import android.util.Log;

import com.novelbook.android.db.Chapter;
import com.novelbook.android.db.SiteRule;
import com.novelbook.android.netutils.HttpMethods;
import com.novelbook.android.netutils.NetUtil;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.IOException;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import okhttp3.Request;
import okhttp3.Response;
import okhttp3.ResponseBody;

public class NovelParseUtil {
    // Log tag for this class: its simple class name.
    private static final String TAG=NovelParseUtil.class.getSimpleName();
    // Default regex used to find anchor tags when a site rule supplies none; the value lives in
    // Constants.A_Regex. The trailing comment appears to show that value (group 1 = href,
    // group 2 = link text) -- NOTE(review): confirm it still matches Constants.A_Regex.
    private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<";
    /**
     * Parses a table-of-contents page into a flat array of chapter data.
     *
     * <p>The result alternates {@code url, name, url, name, ...}. Entries are written from the
     * back of the array towards the front, i.e. in the reverse of the iteration order of the
     * map returned by {@link #getChaptersMap(String, String, JSONObject)}.
     *
     * @param muluUrl  URL of the table-of-contents page, used to resolve relative links
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule object, passed through to {@code getChaptersMap}
     * @return the flattened url/name pairs; an empty array when no chapter link was found
     * @throws JSONException if a required rule key is missing from {@code siteJson}
     */
    public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
        Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
        // getChaptersMap returns null when the regex matched nothing; do not dereference it.
        if (muluMap == null) {
            return new String[0];
        }

        String[] values = new String[muluMap.size() * 2];
        int pos = values.length - 2;
        for (Map.Entry<String, String> e : muluMap.entrySet()) {
            values[pos] = e.getKey();
            values[pos + 1] = e.getValue();
            pos -= 2;
        }
        return values;
    }
    /**
     * Parses a table-of-contents page into a list of {@link Chapter} objects.
     *
     * <p>The map produced by {@link #getChaptersMap(String, String, JSONObject)} is walked once
     * and each entry is stored from the last slot towards the first, so the returned list is in
     * the reverse of the map's iteration order. Each chapter's index is its 1-based position in
     * the returned list.
     *
     * @param domain   value stored on every chapter via {@code setDomain}
     * @param muluUrl  URL of the table-of-contents page
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule object
     * @return a mutable list of chapters; empty when {@code getChaptersMap} returns null
     * @throws JSONException if a required rule key is missing from {@code siteJson}
     */
    public static List<Chapter>  getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
        Map<String, String> urlToName = getChaptersMap(muluUrl, html, siteJson);
        if (urlToName == null) {
            return new ArrayList<Chapter>();
        }

        int total = urlToName.size();
        Chapter[] ordered = new Chapter[total];
        int slot = total;
        for (Map.Entry<String, String> entry : urlToName.entrySet()) {
            slot--;
            Chapter item = new Chapter();
            item.setChapterUrl(entry.getKey());
            item.setChapterName(entry.getValue());
            item.setDomain(domain);
            item.setIndex(slot + 1); // chapter number, 1-based
            ordered[slot] = item;
        }
        return new ArrayList<Chapter>(Arrays.asList(ordered));
    }
    /**
     * Extracts chapter links from a table-of-contents page.
     *
     * <p>The link regex is the rule's {@code chapterUrlRegexOnMulu} when it is non-empty,
     * otherwise the default {@code A_Regex}. Matches are processed from the last one to the
     * first; for each, the first two values returned by {@code REUtil.groups} are taken as href
     * and name. The href is made absolute against {@code muluUrl}, de-duplicated, and kept only
     * when {@code chapterUrlPattern} yields a non-empty match on it.
     *
     * @param muluUrl  URL of the table-of-contents page, used to resolve relative links
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule; must contain {@code chapterUrlRegexOnMulu} and
     *                 {@code chapterUrlPattern}
     * @return insertion-ordered map of absolute chapter URL to chapter name (last match first),
     *         or {@code null} when the regex matched nothing at all
     * @throws JSONException if one of the two rule keys is missing
     */
    public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
        String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
        String chapterUrlPattern = siteJson.getString("chapterUrlPattern");

        String regex = isBlank(chapterUrlRegexOnMulu) ? A_Regex : chapterUrlRegexOnMulu;
        String[] rows = REUtil.matchs(regex, html);
        if (rows == null || rows.length == 0) return null;

        Map<String, String> muluMap = new LinkedHashMap<String, String>();
        for (int i = rows.length - 1; i >= 0; i--) {
            String[] parts = REUtil.groups(regex, rows[i]);
            // Both href (parts[0]) and name (parts[1]) are needed; a site-specific regex with
            // fewer than two groups would otherwise overrun the array.
            if (parts == null || parts.length < 2 || parts[0] == null) continue;
            String href = getFullUrl(parts[0], muluUrl);
            if (muluMap.containsKey(href)) continue;
            if (isBlank(REUtil.match(chapterUrlPattern, href))) continue;
            muluMap.put(href, parts[1]);
        }

        return muluMap;
    }


    /**
     * Extracts the chapter text from a chapter page and converts it to plain text.
     *
     * <p>Group 1 of the rule's {@code chapterContentRegex} is taken as the raw content. The
     * optional {@code chapterContentDumpRegex} is removed from it, then paragraph and line-break
     * tags become newlines, entities and remaining tags are stripped, runs of spaces are
     * normalised to ideographic spaces (U+3000), and the text is given a two-ideographic-space
     * indent when it does not already start with one.
     *
     * @param html     HTML of the chapter page
     * @param siteJson site rule; must contain {@code chapterContentRegex}
     * @return the cleaned text, or {@code ""} when the content regex finds nothing
     * @throws JSONException if {@code chapterContentRegex} is missing
     */
    public static String getChapterContent(String html, JSONObject siteJson) throws JSONException  {
        String chapterContentRegex = siteJson.getString("chapterContentRegex");
        String text = REUtil.group(chapterContentRegex, html, 1);
        if (isBlank(text)) return "";

        // The dump regex is optional (it is only applied when non-empty), so a rule that omits
        // the key entirely must not fail the whole extraction.
        String chapterContentDumpRegex = siteJson.optString("chapterContentDumpRegex", "");
        if (!isBlank(chapterContentDumpRegex)) {
            text = text.replaceAll(chapterContentDumpRegex, "");
        }

        // Drop declarations/comments such as <!DOCTYPE ...> or <!-- ... -->.
        text = text.replaceAll("<![^>]+?>", "");
        // <p>, </p> and <br> variants become line breaks.
        text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
        text = text.replaceAll("[']+", "‘");
        // Numeric / hash-prefixed character references are removed outright.
        text = text.replaceAll("&#[\\w\\d]+;", "");

        text = text.replaceAll("(?i)&nbsp;", " ");
        // Two ASCII spaces -> one ideographic space; cap runs of ideographic spaces at two.
        text = text.replaceAll("[ ]{2}", "　");
        text = text.replaceAll("[　]{3,}", "　　");

        text = text.replaceAll("<<", "<");
        // Remaining four-character entities, with or without the leading '&'.
        text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
        // Reluctant match: each <script>...</script> block is removed on its own, so chapter
        // text sitting between two script blocks is no longer deleted with them.
        text = text.replaceAll("(?i)<script[\\s\\S]+?</Script>", "")
                .replaceAll("<[^>]*>", "");

        // Collapse whitespace-only lines.
        text = text.replaceAll("\\n[\\s　]+\\n", "\n");
        if (!text.startsWith("　")) text = "　　" + text;

        return text.trim();
    }


    /**
     * Converts a flat {@code url, name, url, name, ...} array into {@link Chapter} objects.
     *
     * @param rows   alternating chapter URL / chapter name values; may be {@code null}
     * @param domain value stored on every chapter via {@code setDomain}
     * @return one chapter per complete url/name pair, indexed from 1 in array order; an empty
     *         list when {@code rows} is {@code null} or empty. A trailing element without a
     *         partner is ignored.
     */
    public static List<Chapter> getChaptersLst(String[] rows,String domain){
        List<Chapter> lst = new ArrayList<Chapter>();
        // The rule-based parser returns null when the site has no usable rule.
        if (rows == null) {
            return lst;
        }

        int index = 0;
        // "i + 1 < length" keeps rows[i + 1] in bounds for odd-length input.
        for (int i = 0; i + 1 < rows.length; i += 2) {
            index++;
            Chapter chapter = new Chapter();
            chapter.setChapterUrl(rows[i]);
            chapter.setChapterName(rows[i + 1]);
            chapter.setDomain(domain);
            chapter.setIndex(index); // chapter number, 1-based
            lst.add(chapter);
        }
        return lst;
    }

    /**
     * Runs the rule-based chapter extraction for {@code url} and wraps the result in
     * {@link Chapter} objects.
     *
     * @param url      URL made available to the rule as the {@code url} context variable
     * @param siteJson site rule object
     * @param siteName value stored on every chapter via {@code setDomain}
     * @param maxAage  max-age value for the Cache-Control header of any page fetch
     * @param siteRule per-site HTTP settings used for any page fetch
     * @return the parsed chapters; an empty list when the rule yields nothing
     * @throws JSONException if the rule object is malformed
     */
    public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException {
        String[] rows = getChapters(url, siteJson, maxAage, siteRule);
        // The array overload returns null when the rule is absent or empty.
        if (rows == null) {
            return new ArrayList<Chapter>();
        }
        return getChaptersLst(rows, siteName);
    }


    /**
     * Evaluates the rule array stored under {@code chapterUrlRegexOnMulu} and collects its output.
     *
     * <p>Each element of the array has a {@code source} template and a {@code regexs} array.
     * The source has {@code {name}} / {@code {name[i]}} placeholders filled from the context
     * (which starts with {@code url}); a source starting with {@code html:} is replaced by the
     * page fetched from the rest of the string. Every regex entry is applied to the source and
     * its result stored in the context under the entry's {@code name}. When the entry has a
     * {@code child}, the child regex is applied to each parent value, stored under the child's
     * {@code name}, and each template in the child's {@code output} array is expanded and
     * appended to the result.
     *
     * @param url      value of the initial {@code url} context variable
     * @param siteJson site rule object
     * @param maxAge   max-age value for the Cache-Control header of page fetches
     * @param siteRule per-site HTTP settings for page fetches
     * @return the expanded output strings in evaluation order, or {@code null} when
     *         {@code chapterUrlRegexOnMulu} is absent or an empty array
     * @throws JSONException if the rule is present but malformed
     */
    public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException {
        // Check for the key before reading it, so a site without this rule yields null
        // instead of a JSONException.
        if (!siteJson.has("chapterUrlRegexOnMulu")) {
            return null;
        }
        JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
        if (muluArray == null || muluArray.length() == 0) return null;

        Map<String, Object> context = new HashMap<String, Object>();
        context.put("url", url);
        List<String> result = new ArrayList<String>();

        // Outermost rule objects.
        for (int i = 0; i < muluArray.length(); i++) {
            JSONObject regexsJson = muluArray.getJSONObject(i);
            String source = getContent(regexsJson.getString("source"), context);
            if (source.startsWith("html:")) {
                source = access(source.substring("html:".length()), maxAge, siteRule);
            }

            // First-level regex objects.
            JSONArray regexsArray = regexsJson.getJSONArray("regexs");
            for (int j = 0; j < regexsArray.length(); j++) {
                JSONObject regexJson = regexsArray.getJSONObject(j);
                String[] values = applyRegex(regexJson, source);
                // No match: nothing to publish and nothing for a child rule to iterate over.
                if (values == null) continue;
                context.put(regexJson.getString("name"), values);

                if (!regexJson.has("child")) continue;
                // The child rule is what normally produces the returned results.
                JSONObject childJson = regexJson.getJSONObject("child");
                for (String value : values) {
                    String[] values2 = applyRegex(childJson, value);
                    if (values2 == null) continue;
                    context.put(childJson.getString("name"), values2);
                    if (!childJson.has("output")) continue;
                    JSONArray outputArray = childJson.getJSONArray("output");
                    for (int m = 0; m < outputArray.length(); m++) {
                        String v = getContent(outputArray.getString(m), context);
                        if (v != null) result.add(v);
                    }
                }
            }
        }
        return result.toArray(new String[0]);
    }

    /**
     * Applies one rule's {@code regex} to {@code input}: via {@code REUtil.groups} when the
     * rule's {@code group} flag is true, otherwise via {@code REUtil.matchs}.
     */
    private static String[] applyRegex(JSONObject rule, String input) throws JSONException {
        if (rule.getBoolean("group")) {
            return REUtil.groups(rule.getString("regex"), input);
        }
        return REUtil.matchs(rule.getString("regex"), input);
    }

    /**
     * Expands {@code {name}} and {@code {name[index]}} placeholders in a template.
     *
     * <p>{@code {name}} is replaced by {@code toString()} of the context value; {@code {name[i]}}
     * is replaced by element {@code i} when the context value is a {@code String[]}, or by an
     * empty string when it is some other non-null value. A placeholder is left untouched when
     * its variable is missing, its index is not a number, or its index is out of range.
     *
     * @param var     the template; may be {@code null}
     * @param context variable values, each either a {@code String} or a {@code String[]}
     * @return the expanded template, or {@code var} unchanged when it has no placeholders
     */
    private static String getContent(String var, Map<String, Object> context) {
        if (var == null) {
            return null;
        }
        String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
        // Other callers in this class treat a null result as "no match"; do the same here.
        if (vs == null) {
            return var;
        }
        for (String v : vs) {
            String vn = v.substring(1, v.length() - 1);
            String value = "";
            if (vn.endsWith("]")) {
                int pos = vn.indexOf("[");
                if (pos == -1) continue;
                int index;
                try {
                    index = Integer.parseInt(vn.substring(pos + 1, vn.length() - 1));
                } catch (NumberFormatException ignored) {
                    // Not an index expression after all (e.g. "{a[x]}"): leave it as written.
                    continue;
                }
                Object ov = context.get(vn.substring(0, pos));
                if (ov == null) continue;
                if (ov instanceof String[]) {
                    String[] ovs = (String[]) ov;
                    if (index < 0 || index >= ovs.length) continue;
                    // A null element would otherwise be concatenated as the text "null".
                    value = ovs[index] == null ? "" : ovs[index];
                }
            } else {
                Object ov = context.get(vn);
                if (ov == null) continue;
                value = ov.toString();
            }

            int pos = var.indexOf(v);
            var = var.substring(0, pos) + value + var.substring(pos + v.length());
        }

        return var;
    }


    /**
     * Downloads {@code url} with the site's configured headers and returns the body as text.
     *
     * <p>The request carries {@code Cache-Control: public, max-age=<maxAge>}, the header pairs
     * from {@code siteRule.getHeaders()} (alternating name, value) and a User-Agent picked at
     * random from {@code siteRule.getUserAgents()}, falling back to
     * {@code HttpMethods.USERAGENT} when none are configured. The body is decoded by
     * {@link #enconding(ResponseBody, String)} using {@code siteRule.getEncoding()}.
     *
     * @param url      absolute http or https URL
     * @param maxAge   max-age value for the Cache-Control request header
     * @param siteRule per-site HTTP settings
     * @return the response body, or {@code ""} when the URL is empty, is not http/https, or the
     *         request fails with an {@link IOException}
     */
    private static String access(String url,int maxAge, SiteRule siteRule) {
        if (TextUtils.isEmpty(url)) {
            return "";
        }
        // Accept both schemes. The previous "not http OR not https" test rejected every
        // plain-http URL because such a URL never starts with "https".
        String lowerUrl = url.toLowerCase();
        if (!lowerUrl.startsWith("http://") && !lowerUrl.startsWith("https://")) {
            return "";
        }

        Request.Builder builder = new Request.Builder()
                .url(url)
                .removeHeader("Pragma")
                .header("Cache-Control", "public, max-age=" + maxAge);

        String[] headers = siteRule.getHeaders();
        if (headers != null) {
            // Name/value pairs; "i + 1 < length" skips a dangling name without a value.
            for (int i = 0; i + 1 < headers.length; i += 2) {
                builder.header(headers[i], headers[i + 1]);
            }
        }

        String userAgent = HttpMethods.USERAGENT;
        String[] userAgents = siteRule.getUserAgents();
        if (userAgents != null && userAgents.length > 0) {
            // nextInt(bound) is exclusive of bound, so this covers every entry and also works
            // for a single-element array (nextInt(0) would throw).
            userAgent = userAgents[new Random().nextInt(userAgents.length)];
        }
        builder.removeHeader("User-Agent").addHeader("User-Agent", userAgent);

        Request request = builder.build();
        Response response = null;
        try {
            response = HttpMethods.getOkClient().newCall(request).execute();
            return enconding(response.body(), siteRule.getEncoding());
        } catch (IOException e) {
            Log.e(TAG, "access: ", e);
        } finally {
            if (response != null && response.body() != null) {
                response.body().close();
            }
        }
        return "";
    }

/**
 * Reads a response body as text and closes it.
 *
 * <p>When the response's Content-Type declares a charset, or when no fallback encoding is
 * given, decoding is left to {@code ResponseBody.string()}. Otherwise the raw bytes are
 * decoded with {@code encode}.
 *
 * @param body   the response body; may be {@code null}
 * @param encode charset name to use when the response declares none; may be {@code null}
 * @return the decoded text, or {@code ""} when the body is {@code null} or reading fails
 * @throws UnsupportedEncodingException kept in the signature for existing callers; failures
 *         are logged and reported as an empty string instead
 */
public  static String enconding(ResponseBody body,  String encode) throws UnsupportedEncodingException {
        if (body == null) {
            return "";
        }
        String s = "";
        try {
            // contentType() is null when the server sends no Content-Type header; that must
            // not be treated as a failure, or the whole page would be lost.
            Charset charset = body.contentType() == null ? null : body.contentType().charset();
            if (charset != null || encode == null || encode.length() == 0) {
                s = body.string();
            } else {
                s = new String(body.bytes(), encode);
            }
        } catch (Exception er) {
            // Best effort: callers treat "" as "nothing fetched", but leave a trace of why.
            Log.e(TAG, "enconding: ", er);
        } finally {
            body.close();
        }
        return s;
}


    /**
     * Tells whether a string carries no characters at all.
     *
     * @param value the string to test; may be {@code null}
     * @return {@code true} for {@code null} and for the empty string; whitespace counts as content
     */
    private static boolean isBlank(String value) {
        if (value == null) {
            return true;
        }
        return value.length() == 0;
    }

    /**
     * Turns a link found on a page into an absolute URL.
     *
     * <ul>
     *   <li>{@code //host/path} gets the referer's scheme ({@code https:} only when the referer
     *       starts with it, otherwise {@code http:}).</li>
     *   <li>With no referer, any other URL is returned unchanged.</li>
     *   <li>{@code http://} and {@code https://} URLs are returned unchanged.</li>
     *   <li>{@code /path} is appended to the referer's scheme and host.</li>
     *   <li>Anything else is appended to the referer's directory (the referer up to its last
     *       {@code /}, or the whole referer when it has no path).</li>
     * </ul>
     *
     * <p>{@code ./} and {@code ../} segments are not normalised.
     *
     * @param url     the link as written in the page; may be {@code null}
     * @param referer absolute URL of the page the link was found on; may be {@code null} or empty
     * @return the absolute URL, or {@code ""} when {@code url} is {@code null}
     */
    public static String getFullUrl(String url, String referer) {
        // A regex group that did not participate in the match arrives here as null.
        if (url == null) {
            return "";
        }
        boolean noReferer = referer == null || referer.length() == 0;

        if (url.startsWith("//")) {
            boolean secure = !noReferer && referer.toLowerCase().startsWith("https:");
            return (secure ? "https:" : "http:") + url;
        }
        if (noReferer) {
            return url;
        }

        String lowerCaseUrl = url.toLowerCase();
        if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) {
            return url;
        }

        if (url.startsWith("/")) {
            // Searching from offset 8 skips past "http://" / "https://" to the first path slash.
            int index = referer.indexOf("/", 8);
            String host = index > -1 ? referer.substring(0, index) : referer;
            return host + url;
        }

        // Only a slash beyond the scheme separator marks a directory boundary.
        int index = referer.lastIndexOf("/");
        String prefix = index > 7 ? referer.substring(0, index) : referer;
        return prefix + "/" + url;
    }
}