392 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
		
		
			
		
	
	
			392 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
|  | package com.novelbook.android.utils; | |||
|  | 
 | |||
|  | 
 | |||
|  | import android.text.TextUtils; | |||
|  | import android.util.Log; | |||
|  | 
 | |||
|  | import com.novelbook.android.db.Chapter; | |||
|  | import com.novelbook.android.db.SiteRule; | |||
|  | import com.novelbook.android.netutils.HttpMethods; | |||
|  | import com.novelbook.android.netutils.NetUtil; | |||
|  | 
 | |||
|  | import org.json.JSONArray; | |||
|  | import org.json.JSONException; | |||
|  | import org.json.JSONObject; | |||
|  | 
 | |||
|  | import java.io.IOException; | |||
|  | 
 | |||
|  | import java.io.UnsupportedEncodingException; | |||
|  | import java.nio.charset.Charset; | |||
|  | import java.util.ArrayList; | |||
|  | import java.util.Arrays; | |||
|  | import java.util.Date; | |||
|  | import java.util.HashMap; | |||
|  | import java.util.LinkedHashMap; | |||
|  | import java.util.List; | |||
|  | import java.util.Map; | |||
|  | import java.util.Random; | |||
|  | import java.util.Set; | |||
|  | 
 | |||
|  | import okhttp3.Request; | |||
|  | import okhttp3.Response; | |||
|  | import okhttp3.ResponseBody; | |||
|  | 
 | |||
|  | public class NovelParseUtil { | |||
|  |     private static final String TAG=NovelParseUtil.class.getSimpleName(); | |||
|  |     private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<"; | |||
|  |     public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException { | |||
|  | 
 | |||
|  |         Map<String, String> muluMap = getChaptersMap(muluUrl,   html,   siteJson); | |||
|  | 
 | |||
|  | 
 | |||
|  |         String[] values = new String[muluMap.size() * 2]; | |||
|  | 
 | |||
|  |         Set<Map.Entry<String, String>> es = muluMap.entrySet(); | |||
|  |         int pos = values.length - 2; | |||
|  |         for (Map.Entry<String, String> e : es) { | |||
|  |             values[pos] = e.getKey(); | |||
|  |             values[pos + 1] = e.getValue(); | |||
|  |             pos -= 2; | |||
|  |         } | |||
|  |         return values; | |||
|  |     } | |||
/**
 * Builds the ordered chapter list for a table-of-contents page.
 *
 * <p>The map returned by {@link #getChaptersMap} is iterated once; each entry is
 * stored at the highest free slot of a scratch array, so the first map entry ends
 * up last and receives the highest chapter index.
 *
 * @param domain   value stored on every chapter via {@code setDomain}
 * @param muluUrl  URL of the table-of-contents page
 * @param html     HTML of the table-of-contents page
 * @param siteJson site rule object
 * @return a mutable list of chapters with 1-based indexes; empty when nothing matched
 * @throws JSONException if a required rule key is missing from {@code siteJson}
 */
public static List<Chapter>  getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
    Map<String, String> urlToName = getChaptersMap(muluUrl, html, siteJson);
    if (urlToName == null) {
        return new ArrayList<Chapter>();
    }

    int slot = urlToName.size();
    Chapter[] ordered = new Chapter[slot];
    for (Map.Entry<String, String> entry : urlToName.entrySet()) {
        Chapter chapter = new Chapter();
        chapter.setChapterUrl(entry.getKey());
        chapter.setChapterName(entry.getValue());
        chapter.setDomain(domain);
        chapter.setIndex(slot); // 1-based chapter number
        slot--;
        ordered[slot] = chapter;
    }
    return new ArrayList<Chapter>(Arrays.asList(ordered));
}
|  |     public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException { | |||
|  |         String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu"); | |||
|  |         String chapterUrlPattern = siteJson.getString("chapterUrlPattern"); | |||
|  | 
 | |||
|  |         //Log.i(TAG, "getChaptersMap: chapterUrlRegexOnMulu: " + chapterUrlRegexOnMulu); | |||
|  |         Map<String, String> muluMap = new LinkedHashMap<String, String>(); | |||
|  |         String regex = A_Regex; | |||
|  |         if (!isBlank(chapterUrlRegexOnMulu)) { | |||
|  |             regex = chapterUrlRegexOnMulu; | |||
|  |         } | |||
|  |         //Log.i(TAG, "getChaptersMap: regex: " + regex); | |||
|  |         String[] rows = REUtil.matchs(regex, html);; | |||
|  |         if (rows == null || rows.length == 0) return null; | |||
|  |         for (int i = rows.length - 1; i >= 0; i--) { | |||
|  |             String row = rows[i]; | |||
|  |             String[] parts = REUtil.groups(regex, row); | |||
|  |             if (parts == null || parts.length == 0) continue; | |||
|  |             String href = getFullUrl(parts[0], muluUrl); | |||
|  |             if (muluMap.containsKey(href)) continue; | |||
|  |             if (isBlank(REUtil.match(chapterUrlPattern, href))) continue; | |||
|  |             String name = parts[1]; | |||
|  |             muluMap.put(href, name); | |||
|  |         } | |||
|  | 
 | |||
|  |         return  muluMap; | |||
|  | 
 | |||
|  |     } | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  |     public static String getChapterContent(String html, JSONObject siteJson) throws JSONException  { | |||
|  |         String chapterContentRegex = siteJson.getString("chapterContentRegex"); | |||
|  |         String text = REUtil.group(chapterContentRegex, html, 1); | |||
|  |         if (isBlank(text)) return ""; | |||
|  | 
 | |||
|  |         String chapterContentDumpRegex = siteJson.getString("chapterContentDumpRegex"); | |||
|  |         if (!isBlank(chapterContentDumpRegex)) { | |||
|  |             text = text.replaceAll(chapterContentDumpRegex, ""); | |||
|  |         } | |||
|  |         text = text.replaceAll("<![^>]+?>", ""); | |||
|  |         // p br --> \n | |||
|  |         text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n"); | |||
|  |         text = text.replaceAll("[']+", "‘"); | |||
|  |         text = text.replaceAll("&#[\\w\\d]+;", ""); | |||
|  | 
 | |||
|  | 
 | |||
|  |         text = text.replaceAll("(?i) ", " "); | |||
|  |         text = text.replaceAll("[ ]{2}", " "); | |||
|  |         text = text.replaceAll("[ ]{3,}", "  "); | |||
|  | 
 | |||
|  |         text = text.replaceAll("<<", "<"); | |||
|  |         text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", ""); | |||
|  |         text = text.replaceAll("(?i)<script[\\s\\S]+</Script>", "") | |||
|  |                 .replaceAll("<[^>]*>", ""); | |||
|  | 
 | |||
|  |         text = text.replaceAll("\\n[\\s ]+\\n", "\n"); | |||
|  |         if (!text.startsWith(" ")) text = "  " + text; | |||
|  | 
 | |||
|  |         return text.trim(); | |||
|  |     } | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  |     public static List<Chapter> getChaptersLst(String[] rows,String domain){ | |||
|  | 
 | |||
|  |         ArrayList<Chapter> lst = new ArrayList<Chapter>(); | |||
|  |        int j=0; | |||
|  |         for (int i=0;i<rows.length;i+=2) { | |||
|  |             j++; | |||
|  |             Chapter chapter = new Chapter(); | |||
|  |             chapter.setChapterUrl( rows[i]); | |||
|  |             chapter.setChapterName(  rows[i+1]); | |||
|  |             chapter.setDomain(domain); | |||
|  |             chapter.setIndex(j); //第几章 | |||
|  |             lst.add(chapter); | |||
|  | 
 | |||
|  |         } | |||
|  |         return lst; | |||
|  |     } | |||
|  | 
 | |||
|  |     public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException { | |||
|  |        return  getChaptersLst(getChapters(url,siteJson,maxAage,siteRule),siteName); | |||
|  |     } | |||
|  | 
 | |||
|  | 
 | |||
|  |     public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException { | |||
|  |         //if (!siteJson.keys().("chapterUrlRegexOnMulu")) return null; | |||
|  |         String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu"); | |||
|  |        // if(TextUtils.isEmpty(chapterUrlRegexOnMulu)) return null; | |||
|  |         if(!siteJson.has("chapterUrlRegexOnMulu")){ | |||
|  |             return null ; | |||
|  |         } | |||
|  |         JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu"); | |||
|  |         if (muluArray == null || muluArray.length()== 0) return null; | |||
|  | 
 | |||
|  |         //Log.i(TAG, "to get chaps muluArray is null: " +( muluArray ==null)  ); | |||
|  | 
 | |||
|  |         Map<String, Object> context = new HashMap<String, Object>(); | |||
|  |         context.put("url", url); | |||
|  |         //Log.i(TAG, "to get chaps url:" + url ); | |||
|  |         List<String> result = new ArrayList<String>(); | |||
|  |         // 最外部的大的规则对象 | |||
|  |         for (int i = 0; i < muluArray.length(); i++) { | |||
|  | 
 | |||
|  |             JSONObject regexsJson = muluArray.getJSONObject(i); | |||
|  |             String source = regexsJson.getString("source"); | |||
|  |             source = getContent(source, context); | |||
|  |             //Log.i(TAG, "to get chaps source:" + source ); | |||
|  |             if (source.startsWith("html:")) { | |||
|  |                 String _url = source.substring("html:".length()); | |||
|  |                 source = access(_url,maxAge,siteRule ); | |||
|  |                 //Log.i(TAG, "to get chaps source:" + source ); | |||
|  |             } | |||
|  | 
 | |||
|  |             // 第一次Regex对象 | |||
|  |             JSONArray regexsArray = regexsJson.getJSONArray("regexs"); | |||
|  |             //Log.i(TAG, "to get chaps regexsArray.length():" + regexsArray.length() ); | |||
|  |             for (int j = 0; j < regexsArray.length(); j++) { | |||
|  |                 JSONObject regexJson = regexsArray.getJSONObject(j); | |||
|  |                 String[] values = null; | |||
|  |                 //Log.i(TAG, "to get chaps regexJson.getBoolean(\"group\"):" + regexJson.getBoolean("group") ); | |||
|  |                 if (regexJson.getBoolean("group")) { | |||
|  |                     values = REUtil.groups(regexJson.getString("regex"), source); | |||
|  |                 } else { | |||
|  |                     values = REUtil.matchs(regexJson.getString("regex"), source); | |||
|  |                 } | |||
|  |                 //Log.i(TAG, "to get chaps values==null? :" + (values==null) ); | |||
|  |                 for(String s:values){ | |||
|  |                     //Log.i(TAG, "to get chaps value  :" + s); | |||
|  |                 } | |||
|  |                 if (values != null) context.put(regexJson.getString("name"), values); | |||
|  | 
 | |||
|  | 
 | |||
|  |                // String child = siteJson.getString("child"); | |||
|  |                 //Log.i(TAG, "to get chaps siteJson.has(\"child\")?  :" + siteJson.has("child")); | |||
|  |                 if (   regexJson.has("child") ) { | |||
|  |                     // 一般用来做返回结果用的 | |||
|  |                     JSONObject childJson = regexJson.getJSONObject("child"); | |||
|  |                     //Log.i(TAG, "to get chaps childJson  :" + childJson.toString()); | |||
|  |                     for (String value : values) { | |||
|  |                         //Log.i(TAG, "to get chaps value in values  :" + value); | |||
|  |                         String[] values2 = null; | |||
|  |                         //Log.i(TAG, "to get chaps childJson.getBoolean(\"group\")  :" + childJson.getBoolean("group")); | |||
|  |                         if (childJson.getBoolean("group")) { | |||
|  |                             values2 = REUtil.groups(childJson.getString("regex"), value); | |||
|  |                         } else { | |||
|  |                             values2 = REUtil.matchs(childJson.getString("regex"), value); | |||
|  |                         } | |||
|  |                         //Log.i(TAG, "to get chaps values2 != null ? :" + (values2 != null)); | |||
|  |                         if (values2 != null) { | |||
|  |                             //Log.i(TAG, "to get chaps value in values2  :" + value); | |||
|  |                             context.put(childJson.getString("name"), values2); | |||
|  |                             //Log.i(TAG, "to get chaps siteJson.has(\"output\")  :" + siteJson.has("output")); | |||
|  |                             if (childJson.has("output")) { | |||
|  |                                 JSONArray outputArray = childJson.getJSONArray("output"); | |||
|  |                                 for (int m = 0; m < outputArray.length(); m++) { | |||
|  |                                     String v = getContent(outputArray.getString(m), context); | |||
|  |                                     //Log.i(TAG, "to get chaps v  :" + v); | |||
|  |                                     if (v != null) result.add(v); | |||
|  |                                 } | |||
|  |                             } | |||
|  |                         } | |||
|  |                     } | |||
|  |                 } | |||
|  |             } | |||
|  |         } | |||
|  |         //Log.i(TAG, "to get chaps <---------------------------- result size  :" + result.size()); | |||
|  |         return result.toArray(new String[0]); | |||
|  |     } | |||
|  | 
 | |||
|  |     private static String getContent(String var, Map<String, Object> context) { | |||
|  |         String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var); | |||
|  |         for (String v : vs) { | |||
|  |             String vn = v.substring(1, v.length() - 1); | |||
|  |             String value = ""; | |||
|  |             if (vn.endsWith("]")) { | |||
|  |                 int pos = vn.indexOf("["); | |||
|  |                 if (pos == -1) continue; | |||
|  |                 int index =Integer.parseInt(vn.substring(pos + 1, vn.length() - 1)); | |||
|  |                 Object ov = context.get(vn.substring(0, pos)); | |||
|  |                 if (ov == null) continue; | |||
|  |                 if (ov.getClass().isArray()) { | |||
|  |                     String[] ovs = (String[])ov; | |||
|  |                     if (index >= ovs.length) continue; | |||
|  |                     value = ovs[index]; | |||
|  |                 } | |||
|  |             } else { | |||
|  |                 if (context.get(vn) == null) continue; | |||
|  |                 value = context.get(vn).toString(); | |||
|  |             } | |||
|  | 
 | |||
|  |             int pos = var.indexOf(v); | |||
|  |             var = var.substring(0,  pos) + value + var.substring(pos + vn.length() + 2); | |||
|  |         } | |||
|  | 
 | |||
|  |         return var; | |||
|  |     } | |||
|  | 
 | |||
|  | 
 | |||
|  |     private static String access(String url,int maxAge, SiteRule siteRule) { | |||
|  |         if(TextUtils.isEmpty(url) || !url.toLowerCase().startsWith("http") || !url.toLowerCase().startsWith("https")){ | |||
|  |             return  ""; | |||
|  |         } | |||
|  |         Request.Builder builder = new Request.Builder() | |||
|  |             //    .tag(mNovel.getNovelId()) //标记 请求的tag,切换小说或离开小说界面(BookActivity) 时 取消未执行完毕的 此tag的所有请求 | |||
|  |                 .url(url) | |||
|  |                 .removeHeader("Pragma") | |||
|  |                 .header("Cache-Control", "public, max-age=" + maxAge) | |||
|  |                 //  .header("Accept-Encoding","gzip, deflate, sdch") | |||
|  |              //   .header("Accept-Language","zh-CN,zh;q=0.9") | |||
|  |                // .header( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") | |||
|  |              //   .header( "Upgrade-Insecure-Requests", "1") | |||
|  |                // .header("content-type", "text/html; charset=utf-8") | |||
|  |                 ; | |||
|  |         for(int i=0;i<siteRule.getHeaders().length;i+=2){ | |||
|  |             builder.header(siteRule.getHeaders()[i],siteRule.getHeaders()[i+1]); | |||
|  | 
 | |||
|  |         } | |||
|  |         if(siteRule.getUserAgents()!=null && siteRule.getUserAgents().length>0){ | |||
|  | 
 | |||
|  |             builder.removeHeader("User-Agent").addHeader("User-Agent", siteRule.getUserAgents()[new Random().nextInt( siteRule.getUserAgents().length-1)]); //加 随机agent | |||
|  | 
 | |||
|  |         }else{ | |||
|  |             builder.removeHeader("User-Agent").addHeader("User-Agent",  HttpMethods.USERAGENT); | |||
|  |         } | |||
|  |         Request request =builder.build() ; | |||
|  |         Response response = null; | |||
|  |         try { | |||
|  |             response =  HttpMethods.getOkClient().newCall(request).execute(); | |||
|  |             //String s =response.body().string(); | |||
|  | 
 | |||
|  |             String s = enconding(response.body(),siteRule.getEncoding()); //new String(response.body().bytes(),  encoding); | |||
|  |          //   response.body().close(); | |||
|  |             long st = new java.util.Date().getTime(); | |||
|  |             //Log.i(TAG, "to get chaps access result:" + s ); | |||
|  |             return  s; | |||
|  |           //  return enconding(s,encoding); | |||
|  |          //  return info; | |||
|  |         } catch (IOException e) { | |||
|  |             e.printStackTrace(); | |||
|  |             Log.e(TAG, "access: ", e); | |||
|  |         }finally { | |||
|  |             if(response!=null) | |||
|  |                 response.body().close(); | |||
|  |         } | |||
|  |         return ""; | |||
|  | 
 | |||
|  |     } | |||
|  | 
 | |||
|  | public  static String enconding(ResponseBody body,  String encode) throws UnsupportedEncodingException { | |||
|  |         String s=""; | |||
|  |         try{ | |||
|  |             Charset charset = body.contentType().charset(); | |||
|  |             if(charset!=null){ | |||
|  |                 s= body.string(); | |||
|  |             }else { | |||
|  |                 s= new String(body.bytes(), encode); | |||
|  |             } | |||
|  |         }catch (Exception er){ | |||
|  | 
 | |||
|  |         }finally { | |||
|  |             body.close(); | |||
|  |         } | |||
|  |         return s; | |||
|  | 
 | |||
|  | } | |||
|  | 
 | |||
|  | 
 | |||
|  | 
 | |||
|  |     private static boolean isBlank(String value) { | |||
|  |         return value == null || "".equals(value); | |||
|  |     } | |||
|  | 
 | |||
|  |     public static String getFullUrl(String url, String referer) { | |||
|  |         if ( isBlank(referer) ) { | |||
|  |             if (url.startsWith("//")) { | |||
|  |                 return "http:" + url; | |||
|  |             } else { | |||
|  |                 return url; | |||
|  |             } | |||
|  |         } else if (url.startsWith("//")) { | |||
|  |             if (referer.toLowerCase().startsWith("https:")) { | |||
|  |                 return "https:" + url; | |||
|  |             } else { | |||
|  |                 return "http:" + url; | |||
|  |             } | |||
|  |         } | |||
|  | 
 | |||
|  |         String lowerCaseUrl = url.toLowerCase(); | |||
|  |         if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) { | |||
|  |             return url; | |||
|  |         } | |||
|  |         if (url.startsWith("/")) { | |||
|  |             int index = referer.indexOf("/",  8); | |||
|  |             String host =  referer; | |||
|  |             if (index > -1) { | |||
|  |                 host = referer.substring(0,  index); | |||
|  |             } | |||
|  |             return host + url; | |||
|  |         } else { | |||
|  |             int index = referer.lastIndexOf("/"); | |||
|  |             String prefix = referer; | |||
|  |             if (index > 7) { | |||
|  |                 prefix =  referer.substring(0,  index); | |||
|  |             } | |||
|  |             return prefix + "/" + url; | |||
|  |         } | |||
|  |     } | |||
|  | } |