392 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
			
		
		
	
	
			392 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
| package com.novelbook.android.utils;
 | ||
| 
 | ||
| 
 | ||
| import android.text.TextUtils;
 | ||
| import android.util.Log;
 | ||
| 
 | ||
| import com.novelbook.android.db.Chapter;
 | ||
| import com.novelbook.android.db.SiteRule;
 | ||
| import com.novelbook.android.netutils.HttpMethods;
 | ||
| import com.novelbook.android.netutils.NetUtil;
 | ||
| 
 | ||
| import org.json.JSONArray;
 | ||
| import org.json.JSONException;
 | ||
| import org.json.JSONObject;
 | ||
| 
 | ||
| import java.io.IOException;
 | ||
| 
 | ||
| import java.io.UnsupportedEncodingException;
 | ||
| import java.nio.charset.Charset;
 | ||
| import java.util.ArrayList;
 | ||
| import java.util.Arrays;
 | ||
| import java.util.Date;
 | ||
| import java.util.HashMap;
 | ||
| import java.util.LinkedHashMap;
 | ||
| import java.util.List;
 | ||
| import java.util.Map;
 | ||
| import java.util.Random;
 | ||
| import java.util.Set;
 | ||
| 
 | ||
| import okhttp3.Request;
 | ||
| import okhttp3.Response;
 | ||
| import okhttp3.ResponseBody;
 | ||
| 
 | ||
| public class NovelParseUtil {
 | ||
// Tag passed to android.util.Log calls made from this class (the simple class name).
|     private static final String TAG=NovelParseUtil.class.getSimpleName();
 | ||
// Default link-matching regex, taken from Constants.A_Regex; getChaptersMap falls back to it when the
// site rule supplies no "chapterUrlRegexOnMulu". The trailing comment presumably mirrors that constant's value.
|     private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<";
 | ||
|     public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
 | ||
| 
 | ||
|         Map<String, String> muluMap = getChaptersMap(muluUrl,   html,   siteJson);
 | ||
| 
 | ||
| 
 | ||
|         String[] values = new String[muluMap.size() * 2];
 | ||
| 
 | ||
|         Set<Map.Entry<String, String>> es = muluMap.entrySet();
 | ||
|         int pos = values.length - 2;
 | ||
|         for (Map.Entry<String, String> e : es) {
 | ||
|             values[pos] = e.getKey();
 | ||
|             values[pos + 1] = e.getValue();
 | ||
|             pos -= 2;
 | ||
|         }
 | ||
|         return values;
 | ||
|     }
 | ||
|     public static List<Chapter>  getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
 | ||
| 
 | ||
|         Map<String, String> muluMap = getChaptersMap(muluUrl,   html,   siteJson);
 | ||
|         if(muluMap==null){
 | ||
|             return new ArrayList<Chapter>();
 | ||
|         }
 | ||
|         Chapter[] tmp = new Chapter[muluMap.size()];
 | ||
| 
 | ||
| 
 | ||
|         Set<Map.Entry<String, String>> es = muluMap.entrySet();
 | ||
|         int pos = tmp.length - 1;
 | ||
|         for (Map.Entry<String, String> e : es) {
 | ||
| 
 | ||
|             Chapter chapter = new Chapter();
 | ||
|             chapter.setChapterUrl( e.getKey());
 | ||
|             chapter.setChapterName( e.getValue());
 | ||
|             chapter.setDomain(domain);
 | ||
|             chapter.setIndex(pos+1); //第几章
 | ||
| 
 | ||
|            tmp[pos--] =chapter;
 | ||
|         }
 | ||
|         List<Chapter> values = new ArrayList<Chapter>(Arrays.asList(tmp));
 | ||
|         return values;
 | ||
|     }
 | ||
|     public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
 | ||
|         String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
 | ||
|         String chapterUrlPattern = siteJson.getString("chapterUrlPattern");
 | ||
| 
 | ||
|         //Log.i(TAG, "getChaptersMap: chapterUrlRegexOnMulu: " + chapterUrlRegexOnMulu);
 | ||
|         Map<String, String> muluMap = new LinkedHashMap<String, String>();
 | ||
|         String regex = A_Regex;
 | ||
|         if (!isBlank(chapterUrlRegexOnMulu)) {
 | ||
|             regex = chapterUrlRegexOnMulu;
 | ||
|         }
 | ||
|         //Log.i(TAG, "getChaptersMap: regex: " + regex);
 | ||
|         String[] rows = REUtil.matchs(regex, html);;
 | ||
|         if (rows == null || rows.length == 0) return null;
 | ||
|         for (int i = rows.length - 1; i >= 0; i--) {
 | ||
|             String row = rows[i];
 | ||
|             String[] parts = REUtil.groups(regex, row);
 | ||
|             if (parts == null || parts.length == 0) continue;
 | ||
|             String href = getFullUrl(parts[0], muluUrl);
 | ||
|             if (muluMap.containsKey(href)) continue;
 | ||
|             if (isBlank(REUtil.match(chapterUrlPattern, href))) continue;
 | ||
|             String name = parts[1];
 | ||
|             muluMap.put(href, name);
 | ||
|         }
 | ||
| 
 | ||
|         return  muluMap;
 | ||
| 
 | ||
|     }
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
|     public static String getChapterContent(String html, JSONObject siteJson) throws JSONException  {
 | ||
|         String chapterContentRegex = siteJson.getString("chapterContentRegex");
 | ||
|         String text = REUtil.group(chapterContentRegex, html, 1);
 | ||
|         if (isBlank(text)) return "";
 | ||
| 
 | ||
|         String chapterContentDumpRegex = siteJson.getString("chapterContentDumpRegex");
 | ||
|         if (!isBlank(chapterContentDumpRegex)) {
 | ||
|             text = text.replaceAll(chapterContentDumpRegex, "");
 | ||
|         }
 | ||
|         text = text.replaceAll("<![^>]+?>", "");
 | ||
|         // p br --> \n
 | ||
|         text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
 | ||
|         text = text.replaceAll("[']+", "‘");
 | ||
|         text = text.replaceAll("&#[\\w\\d]+;", "");
 | ||
| 
 | ||
| 
 | ||
|         text = text.replaceAll("(?i) ", " ");
 | ||
|         text = text.replaceAll("[ ]{2}", " ");
 | ||
|         text = text.replaceAll("[ ]{3,}", "  ");
 | ||
| 
 | ||
|         text = text.replaceAll("<<", "<");
 | ||
|         text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
 | ||
|         text = text.replaceAll("(?i)<script[\\s\\S]+</Script>", "")
 | ||
|                 .replaceAll("<[^>]*>", "");
 | ||
| 
 | ||
|         text = text.replaceAll("\\n[\\s ]+\\n", "\n");
 | ||
|         if (!text.startsWith(" ")) text = "  " + text;
 | ||
| 
 | ||
|         return text.trim();
 | ||
|     }
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
|     public static List<Chapter> getChaptersLst(String[] rows,String domain){
 | ||
| 
 | ||
|         ArrayList<Chapter> lst = new ArrayList<Chapter>();
 | ||
|        int j=0;
 | ||
|         for (int i=0;i<rows.length;i+=2) {
 | ||
|             j++;
 | ||
|             Chapter chapter = new Chapter();
 | ||
|             chapter.setChapterUrl( rows[i]);
 | ||
|             chapter.setChapterName(  rows[i+1]);
 | ||
|             chapter.setDomain(domain);
 | ||
|             chapter.setIndex(j); //第几章
 | ||
|             lst.add(chapter);
 | ||
| 
 | ||
|         }
 | ||
|         return lst;
 | ||
|     }
 | ||
| 
 | ||
|     public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException {
 | ||
|        return  getChaptersLst(getChapters(url,siteJson,maxAage,siteRule),siteName);
 | ||
|     }
 | ||
| 
 | ||
| 
 | ||
|     public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException {
 | ||
|         //if (!siteJson.keys().("chapterUrlRegexOnMulu")) return null;
 | ||
|         String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
 | ||
|        // if(TextUtils.isEmpty(chapterUrlRegexOnMulu)) return null;
 | ||
|         if(!siteJson.has("chapterUrlRegexOnMulu")){
 | ||
|             return null ;
 | ||
|         }
 | ||
|         JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
 | ||
|         if (muluArray == null || muluArray.length()== 0) return null;
 | ||
| 
 | ||
|         //Log.i(TAG, "to get chaps muluArray is null: " +( muluArray ==null)  );
 | ||
| 
 | ||
|         Map<String, Object> context = new HashMap<String, Object>();
 | ||
|         context.put("url", url);
 | ||
|         //Log.i(TAG, "to get chaps url:" + url );
 | ||
|         List<String> result = new ArrayList<String>();
 | ||
|         // 最外部的大的规则对象
 | ||
|         for (int i = 0; i < muluArray.length(); i++) {
 | ||
| 
 | ||
|             JSONObject regexsJson = muluArray.getJSONObject(i);
 | ||
|             String source = regexsJson.getString("source");
 | ||
|             source = getContent(source, context);
 | ||
|             //Log.i(TAG, "to get chaps source:" + source );
 | ||
|             if (source.startsWith("html:")) {
 | ||
|                 String _url = source.substring("html:".length());
 | ||
|                 source = access(_url,maxAge,siteRule );
 | ||
|                 //Log.i(TAG, "to get chaps source:" + source );
 | ||
|             }
 | ||
| 
 | ||
|             // 第一次Regex对象
 | ||
|             JSONArray regexsArray = regexsJson.getJSONArray("regexs");
 | ||
|             //Log.i(TAG, "to get chaps regexsArray.length():" + regexsArray.length() );
 | ||
|             for (int j = 0; j < regexsArray.length(); j++) {
 | ||
|                 JSONObject regexJson = regexsArray.getJSONObject(j);
 | ||
|                 String[] values = null;
 | ||
|                 //Log.i(TAG, "to get chaps regexJson.getBoolean(\"group\"):" + regexJson.getBoolean("group") );
 | ||
|                 if (regexJson.getBoolean("group")) {
 | ||
|                     values = REUtil.groups(regexJson.getString("regex"), source);
 | ||
|                 } else {
 | ||
|                     values = REUtil.matchs(regexJson.getString("regex"), source);
 | ||
|                 }
 | ||
|                 //Log.i(TAG, "to get chaps values==null? :" + (values==null) );
 | ||
|                 for(String s:values){
 | ||
|                     //Log.i(TAG, "to get chaps value  :" + s);
 | ||
|                 }
 | ||
|                 if (values != null) context.put(regexJson.getString("name"), values);
 | ||
| 
 | ||
| 
 | ||
|                // String child = siteJson.getString("child");
 | ||
|                 //Log.i(TAG, "to get chaps siteJson.has(\"child\")?  :" + siteJson.has("child"));
 | ||
|                 if (   regexJson.has("child") ) {
 | ||
|                     // 一般用来做返回结果用的
 | ||
|                     JSONObject childJson = regexJson.getJSONObject("child");
 | ||
|                     //Log.i(TAG, "to get chaps childJson  :" + childJson.toString());
 | ||
|                     for (String value : values) {
 | ||
|                         //Log.i(TAG, "to get chaps value in values  :" + value);
 | ||
|                         String[] values2 = null;
 | ||
|                         //Log.i(TAG, "to get chaps childJson.getBoolean(\"group\")  :" + childJson.getBoolean("group"));
 | ||
|                         if (childJson.getBoolean("group")) {
 | ||
|                             values2 = REUtil.groups(childJson.getString("regex"), value);
 | ||
|                         } else {
 | ||
|                             values2 = REUtil.matchs(childJson.getString("regex"), value);
 | ||
|                         }
 | ||
|                         //Log.i(TAG, "to get chaps values2 != null ? :" + (values2 != null));
 | ||
|                         if (values2 != null) {
 | ||
|                             //Log.i(TAG, "to get chaps value in values2  :" + value);
 | ||
|                             context.put(childJson.getString("name"), values2);
 | ||
|                             //Log.i(TAG, "to get chaps siteJson.has(\"output\")  :" + siteJson.has("output"));
 | ||
|                             if (childJson.has("output")) {
 | ||
|                                 JSONArray outputArray = childJson.getJSONArray("output");
 | ||
|                                 for (int m = 0; m < outputArray.length(); m++) {
 | ||
|                                     String v = getContent(outputArray.getString(m), context);
 | ||
|                                     //Log.i(TAG, "to get chaps v  :" + v);
 | ||
|                                     if (v != null) result.add(v);
 | ||
|                                 }
 | ||
|                             }
 | ||
|                         }
 | ||
|                     }
 | ||
|                 }
 | ||
|             }
 | ||
|         }
 | ||
|         //Log.i(TAG, "to get chaps <---------------------------- result size  :" + result.size());
 | ||
|         return result.toArray(new String[0]);
 | ||
|     }
 | ||
| 
 | ||
|     private static String getContent(String var, Map<String, Object> context) {
 | ||
|         String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
 | ||
|         for (String v : vs) {
 | ||
|             String vn = v.substring(1, v.length() - 1);
 | ||
|             String value = "";
 | ||
|             if (vn.endsWith("]")) {
 | ||
|                 int pos = vn.indexOf("[");
 | ||
|                 if (pos == -1) continue;
 | ||
|                 int index =Integer.parseInt(vn.substring(pos + 1, vn.length() - 1));
 | ||
|                 Object ov = context.get(vn.substring(0, pos));
 | ||
|                 if (ov == null) continue;
 | ||
|                 if (ov.getClass().isArray()) {
 | ||
|                     String[] ovs = (String[])ov;
 | ||
|                     if (index >= ovs.length) continue;
 | ||
|                     value = ovs[index];
 | ||
|                 }
 | ||
|             } else {
 | ||
|                 if (context.get(vn) == null) continue;
 | ||
|                 value = context.get(vn).toString();
 | ||
|             }
 | ||
| 
 | ||
|             int pos = var.indexOf(v);
 | ||
|             var = var.substring(0,  pos) + value + var.substring(pos + vn.length() + 2);
 | ||
|         }
 | ||
| 
 | ||
|         return var;
 | ||
|     }
 | ||
| 
 | ||
| 
 | ||
|     private static String access(String url,int maxAge, SiteRule siteRule) {
 | ||
|         if(TextUtils.isEmpty(url) || !url.toLowerCase().startsWith("http") || !url.toLowerCase().startsWith("https")){
 | ||
|             return  "";
 | ||
|         }
 | ||
|         Request.Builder builder = new Request.Builder()
 | ||
|             //    .tag(mNovel.getNovelId()) //标记 请求的tag,切换小说或离开小说界面(BookActivity) 时 取消未执行完毕的 此tag的所有请求
 | ||
|                 .url(url)
 | ||
|                 .removeHeader("Pragma")
 | ||
|                 .header("Cache-Control", "public, max-age=" + maxAge)
 | ||
|                 //  .header("Accept-Encoding","gzip, deflate, sdch")
 | ||
|              //   .header("Accept-Language","zh-CN,zh;q=0.9")
 | ||
|                // .header( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
 | ||
|              //   .header( "Upgrade-Insecure-Requests", "1")
 | ||
|                // .header("content-type", "text/html; charset=utf-8")
 | ||
|                 ;
 | ||
|         for(int i=0;i<siteRule.getHeaders().length;i+=2){
 | ||
|             builder.header(siteRule.getHeaders()[i],siteRule.getHeaders()[i+1]);
 | ||
| 
 | ||
|         }
 | ||
|         if(siteRule.getUserAgents()!=null && siteRule.getUserAgents().length>0){
 | ||
| 
 | ||
|             builder.removeHeader("User-Agent").addHeader("User-Agent", siteRule.getUserAgents()[new Random().nextInt( siteRule.getUserAgents().length-1)]); //加 随机agent
 | ||
| 
 | ||
|         }else{
 | ||
|             builder.removeHeader("User-Agent").addHeader("User-Agent",  HttpMethods.USERAGENT);
 | ||
|         }
 | ||
|         Request request =builder.build() ;
 | ||
|         Response response = null;
 | ||
|         try {
 | ||
|             response =  HttpMethods.getOkClient().newCall(request).execute();
 | ||
|             //String s =response.body().string();
 | ||
| 
 | ||
|             String s = enconding(response.body(),siteRule.getEncoding()); //new String(response.body().bytes(),  encoding);
 | ||
|          //   response.body().close();
 | ||
|             long st = new java.util.Date().getTime();
 | ||
|             //Log.i(TAG, "to get chaps access result:" + s );
 | ||
|             return  s;
 | ||
|           //  return enconding(s,encoding);
 | ||
|          //  return info;
 | ||
|         } catch (IOException e) {
 | ||
|             e.printStackTrace();
 | ||
|             Log.e(TAG, "access: ", e);
 | ||
|         }finally {
 | ||
|             if(response!=null)
 | ||
|                 response.body().close();
 | ||
|         }
 | ||
|         return "";
 | ||
| 
 | ||
|     }
 | ||
| 
 | ||
| public  static String enconding(ResponseBody body,  String encode) throws UnsupportedEncodingException {
 | ||
|         String s="";
 | ||
|         try{
 | ||
|             Charset charset = body.contentType().charset();
 | ||
|             if(charset!=null){
 | ||
|                 s= body.string();
 | ||
|             }else {
 | ||
|                 s= new String(body.bytes(), encode);
 | ||
|             }
 | ||
|         }catch (Exception er){
 | ||
| 
 | ||
|         }finally {
 | ||
|             body.close();
 | ||
|         }
 | ||
|         return s;
 | ||
| 
 | ||
| }
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
|     private static boolean isBlank(String value) {
 | ||
|         return value == null || "".equals(value);
 | ||
|     }
 | ||
| 
 | ||
|     public static String getFullUrl(String url, String referer) {
 | ||
|         if ( isBlank(referer) ) {
 | ||
|             if (url.startsWith("//")) {
 | ||
|                 return "http:" + url;
 | ||
|             } else {
 | ||
|                 return url;
 | ||
|             }
 | ||
|         } else if (url.startsWith("//")) {
 | ||
|             if (referer.toLowerCase().startsWith("https:")) {
 | ||
|                 return "https:" + url;
 | ||
|             } else {
 | ||
|                 return "http:" + url;
 | ||
|             }
 | ||
|         }
 | ||
| 
 | ||
|         String lowerCaseUrl = url.toLowerCase();
 | ||
|         if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) {
 | ||
|             return url;
 | ||
|         }
 | ||
|         if (url.startsWith("/")) {
 | ||
|             int index = referer.indexOf("/",  8);
 | ||
|             String host =  referer;
 | ||
|             if (index > -1) {
 | ||
|                 host = referer.substring(0,  index);
 | ||
|             }
 | ||
|             return host + url;
 | ||
|         } else {
 | ||
|             int index = referer.lastIndexOf("/");
 | ||
|             String prefix = referer;
 | ||
|             if (index > 7) {
 | ||
|                 prefix =  referer.substring(0,  index);
 | ||
|             }
 | ||
|             return prefix + "/" + url;
 | ||
|         }
 | ||
|     }
 | ||
| }
 |