package com.novelbook.android.utils;

import android.util.Log;

import com.novelbook.android.db.Chapter;
import com.novelbook.android.db.SiteRule;
import com.novelbook.android.netutils.HttpMethods;
import com.novelbook.android.netutils.NetUtil;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import okhttp3.Request;
import okhttp3.Response;
import okhttp3.ResponseBody;

/**
 * Static helpers that extract chapter lists and chapter text from novel-site HTML,
 * driven by per-site JSON rules.
 *
 * <p>NOTE(review): the text this class was restored from had every {@code <...>} span
 * stripped (generic type arguments, parts of two loops, parts of several regex literals).
 * Generic types have been restored from usage; every place where code or a literal could
 * not be recovered with certainty carries its own NOTE(review) — compare those against
 * version control before merging.
 */
public class NovelParseUtil {
    private static final String TAG = NovelParseUtil.class.getSimpleName();

    /** Default link-matching pattern, used when the site rule supplies none. */
    private static final String A_Regex = Constants.A_Regex;

    /** Shared generator for picking a user agent; avoids allocating one per request. */
    private static final Random RANDOM = new Random();

    /**
     * Flattens the chapter map of a table-of-contents page into a
     * {@code [url, name, url, name, ...]} array.
     *
     * <p>The array is filled from the back while the map is iterated from the front, so
     * the pairs come out in the reverse of the map's iteration order.
     *
     * @param muluUrl  URL of the table-of-contents page, used to resolve relative links
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule object, see {@link #getChaptersMap}
     * @return the flattened pairs; an empty array when no chapter link was found
     * @throws JSONException if a required rule key is missing
     */
    public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
        Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
        if (muluMap == null) {
            // getChaptersMap reports "nothing matched" as null; the original dereferenced it.
            return new String[0];
        }
        String[] values = new String[muluMap.size() * 2];
        int pos = values.length - 2;
        for (Map.Entry<String, String> e : muluMap.entrySet()) {
            values[pos] = e.getKey();
            values[pos + 1] = e.getValue();
            pos -= 2;
        }
        return values;
    }

    /**
     * Builds {@link Chapter} objects from a table-of-contents page.
     *
     * <p>Like {@link #getChaptersArray}, the result is in the reverse of the map's
     * iteration order; each chapter's index is its 1-based position in the returned list.
     *
     * @param domain   value stored on every chapter via {@code setDomain}
     * @param muluUrl  URL of the table-of-contents page
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule object, see {@link #getChaptersMap}
     * @return a mutable list of chapters; empty when no chapter link was found
     * @throws JSONException if a required rule key is missing
     */
    public static List<Chapter> getChapters(String domain, String muluUrl, String html, JSONObject siteJson) throws JSONException {
        Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
        if (muluMap == null) {
            return new ArrayList<Chapter>();
        }
        Chapter[] tmp = new Chapter[muluMap.size()];
        int pos = tmp.length - 1;
        for (Map.Entry<String, String> e : muluMap.entrySet()) {
            Chapter chapter = new Chapter();
            chapter.setChapterUrl(e.getKey());
            chapter.setChapterName(e.getValue());
            chapter.setDomain(domain);
            chapter.setIndex(pos + 1); // 1-based chapter number
            tmp[pos--] = chapter;
        }
        return new ArrayList<Chapter>(Arrays.asList(tmp));
    }

    /**
     * Extracts chapter links from a table-of-contents page.
     *
     * <p>Rule keys read from {@code siteJson}: {@code chapterUrlRegexOnMulu} (optional;
     * falls back to {@code Constants.A_Regex} when blank) and {@code chapterUrlPattern}
     * (required; a link is kept only if {@code REUtil.match} of this pattern against the
     * resolved URL is non-blank).
     *
     * <p>Matches are visited last-to-first and a URL already present is skipped, so for a
     * duplicated URL the occurrence closest to the end of the match array wins and the
     * map iterates in reverse match order.
     *
     * @param muluUrl  URL of the table-of-contents page, used to resolve relative links
     * @param html     HTML of the table-of-contents page
     * @param siteJson site rule object
     * @return insertion-ordered map of absolute chapter URL to chapter name, or
     *         {@code null} when the regex matched nothing
     * @throws JSONException if {@code chapterUrlPattern} is missing
     */
    public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
        // The rule is optional (blank falls back to the default), so a missing key must not throw.
        String chapterUrlRegexOnMulu = siteJson.optString("chapterUrlRegexOnMulu", "");
        String chapterUrlPattern = siteJson.getString("chapterUrlPattern");

        String regex = isBlank(chapterUrlRegexOnMulu) ? A_Regex : chapterUrlRegexOnMulu;

        String[] rows = REUtil.matchs(regex, html);
        if (rows == null || rows.length == 0) {
            return null;
        }

        Map<String, String> muluMap = new LinkedHashMap<String, String>();
        for (int i = rows.length - 1; i >= 0; i--) {
            String[] parts = REUtil.groups(regex, rows[i]);
            // Both the href (parts[0]) and the name (parts[1]) are needed below.
            if (parts == null || parts.length < 2) {
                continue;
            }
            String href = getFullUrl(parts[0], muluUrl);
            if (muluMap.containsKey(href)) {
                continue;
            }
            if (isBlank(REUtil.match(chapterUrlPattern, href))) {
                continue;
            }
            muluMap.put(href, parts[1]);
        }
        return muluMap;
    }

    /**
     * Extracts the chapter body from a chapter page and reduces it to plain text.
     *
     * <p>Rule keys read from {@code siteJson}: {@code chapterContentRegex} (required;
     * group 1 is the raw body) and {@code chapterContentDumpRegex} (optional; every match
     * is deleted from the body before the clean-up passes run).
     *
     * @param html     HTML of the chapter page
     * @param siteJson site rule object
     * @return the cleaned, trimmed text; {@code ""} when the content regex yields nothing
     * @throws JSONException if {@code chapterContentRegex} is missing
     */
    public static String getChapterContent(String html, JSONObject siteJson) throws JSONException {
        String chapterContentRegex = siteJson.getString("chapterContentRegex");
        String text = REUtil.group(chapterContentRegex, html, 1);
        if (isBlank(text)) {
            return "";
        }

        // Optional rule: a missing key simply means "nothing extra to strip".
        String chapterContentDumpRegex = siteJson.optString("chapterContentDumpRegex", "");
        if (!isBlank(chapterContentDumpRegex)) {
            text = text.replaceAll(chapterContentDumpRegex, "");
        }

        // NOTE(review): this pattern looks truncated (it presumably started with an opening
        // tag such as "<x[^>") — restore the original literal from version control.
        text = text.replaceAll("]+?>", "");
        // <p>, </p> and <br> variants become line breaks.
        text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
        text = text.replaceAll("[']+", "‘");
        text = text.replaceAll("&#[\\w\\d]+;", "");
        // NOTE(review): the next three literals contain only ASCII spaces here; they
        // presumably held an entity or non-ASCII space characters originally — confirm.
        text = text.replaceAll("(?i) ", " ");
        text = text.replaceAll("[ ]{2}", " ");
        text = text.replaceAll("[ ]{3,}", "  ");
        text = text.replaceAll("<<", "<");
        text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
        // NOTE(review): "(?i)" with nothing after it matches the empty string, so this
        // replaceAll changes nothing; the pattern body appears to be lost — confirm.
        text = text.replaceAll("(?i)", "")
                .replaceAll("<[^>]*>", "");
        text = text.replaceAll("\\n[\\s ]+\\n", "\n");
        // NOTE(review): with ASCII spaces this indent is removed again by trim() below;
        // presumably the original literal used spaces that trim() keeps — confirm.
        if (!text.startsWith(" ")) {
            text = "  " + text;
        }
        return text.trim();
    }

    /**
     * Converts a flat string array into {@link Chapter} objects.
     *
     * <p>NOTE(review): everything in this method after {@code for (int i=0;i} was lost
     * from the text this file was restored from. The loop below is a reconstruction based
     * on the sibling methods (URL/name pairs as in {@link #getChaptersArray}, 1-based
     * index as in {@link #getChapters(String, String, String, JSONObject)}) — verify the
     * pair order and index base against version control.
     *
     * @param rows   presumably {@code [url, name, url, name, ...]}; may be {@code null}
     * @param domain value stored on every chapter via {@code setDomain}
     * @return list of chapters; empty when {@code rows} is {@code null} or empty
     */
    public static List<Chapter> getChaptersLst(String[] rows, String domain) {
        ArrayList<Chapter> lst = new ArrayList<Chapter>();
        if (rows == null) {
            // getChapters(url, ...) returns null when the site has no multi-step rule.
            return lst;
        }
        int j = 0;
        // "i + 1 < length" ignores a dangling last element instead of overrunning.
        for (int i = 0; i + 1 < rows.length; i += 2) {
            Chapter chapter = new Chapter();
            chapter.setChapterUrl(rows[i]);
            chapter.setChapterName(rows[i + 1]);
            chapter.setDomain(domain);
            chapter.setIndex(++j);
            lst.add(chapter);
        }
        return lst;
    }

    /**
     * Runs the multi-step rule (see the four-argument overload) and wraps its output in
     * {@link Chapter} objects via {@link #getChaptersLst}.
     *
     * @param url      starting URL, exposed to the rule as {@code {url}}
     * @param siteJson site rule object
     * @param siteName passed to {@link #getChaptersLst} as the chapter domain
     * @param maxAage  cache max-age in seconds sent with any page fetch
     * @param siteRule supplies user agents and the fallback encoding for page fetches
     * @return list of chapters, possibly empty
     * @throws JSONException if the rule object is malformed
     */
    public static List<Chapter> getChapters(String url, JSONObject siteJson, String siteName, int maxAage, SiteRule siteRule) throws JSONException {
        return getChaptersLst(getChapters(url, siteJson, maxAage, siteRule), siteName);
    }

    /**
     * Evaluates the multi-step extraction rule stored as a JSON array under
     * {@code chapterUrlRegexOnMulu}.
     *
     * <p>Each array element has a {@code source} template (expanded with
     * {@link #getContent}; a result starting with {@code html:} is fetched over HTTP) and
     * a {@code regexs} array. Each regex object has {@code regex}, {@code name},
     * {@code group} (true selects {@code REUtil.groups}, false {@code REUtil.matchs}) and
     * an optional {@code child} of the same shape that is applied to every value of its
     * parent. A child may carry an {@code output} array of templates; each expanded
     * template is appended to the result.
     *
     * @param url      starting URL, stored in the template context as {@code url}
     * @param siteJson site rule object
     * @param maxAge   cache max-age in seconds sent with any page fetch
     * @param siteRule supplies user agents and the fallback encoding for page fetches
     * @return the collected output strings, or {@code null} when the rule key is absent
     *         or its array is empty
     * @throws JSONException if the rule object is malformed
     */
    public static String[] getChapters(String url, JSONObject siteJson, int maxAge, SiteRule siteRule) throws JSONException {
        // Check presence first: the original read the key with getString before this test,
        // which threw on a missing key and made the null return unreachable.
        if (!siteJson.has("chapterUrlRegexOnMulu")) {
            return null;
        }
        JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
        if (muluArray == null || muluArray.length() == 0) {
            return null;
        }

        Map<String, Object> context = new HashMap<String, Object>();
        context.put("url", url);
        List<String> result = new ArrayList<String>();

        // Outermost rule objects.
        for (int i = 0; i < muluArray.length(); i++) {
            JSONObject regexsJson = muluArray.getJSONObject(i);
            String source = getContent(regexsJson.getString("source"), context);
            if (source.startsWith("html:")) {
                source = access(source.substring("html:".length()), maxAge, siteRule);
            }

            // First-level regex objects.
            JSONArray regexsArray = regexsJson.getJSONArray("regexs");
            for (int j = 0; j < regexsArray.length(); j++) {
                JSONObject regexJson = regexsArray.getJSONObject(j);
                String[] values = applyRegex(regexJson, source);
                if (values == null) {
                    // Nothing matched: nothing to publish and no parent values for a child.
                    continue;
                }
                context.put(regexJson.getString("name"), values);
                if (regexJson.has("child")) {
                    // The child level is generally what produces the returned results.
                    collectChildOutput(regexJson.getJSONObject("child"), values, context, result);
                }
            }
        }
        return result.toArray(new String[0]);
    }

    /** Applies one regex rule object to {@code input}, honouring its {@code group} flag. */
    private static String[] applyRegex(JSONObject rule, String input) throws JSONException {
        if (rule.getBoolean("group")) {
            return REUtil.groups(rule.getString("regex"), input);
        }
        return REUtil.matchs(rule.getString("regex"), input);
    }

    /** Runs a child rule over every parent value and appends its expanded output templates. */
    private static void collectChildOutput(JSONObject childJson, String[] parentValues,
                                           Map<String, Object> context, List<String> result) throws JSONException {
        for (String value : parentValues) {
            String[] childValues = applyRegex(childJson, value);
            if (childValues == null) {
                continue;
            }
            context.put(childJson.getString("name"), childValues);
            if (!childJson.has("output")) {
                continue;
            }
            JSONArray outputArray = childJson.getJSONArray("output");
            for (int m = 0; m < outputArray.length(); m++) {
                String v = getContent(outputArray.getString(m), context);
                if (v != null) {
                    result.add(v);
                }
            }
        }
    }

    /**
     * Expands {@code {name}} and {@code {name[index]}} placeholders in a template.
     *
     * <p>{@code {name}} is replaced by {@code toString()} of the context value;
     * {@code {name[index]}} by that element when the context value is a {@code String[]}
     * (and by the empty string when it is some other non-null value). A placeholder whose
     * name is unknown, whose index is not a number, or whose index is out of range is
     * left in place.
     *
     * @param var     template text
     * @param context values available to placeholders
     * @return the expanded template
     */
    private static String getContent(String var, Map<String, Object> context) {
        String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
        if (vs == null) {
            // No placeholders: the template is already final.
            return var;
        }
        for (String v : vs) {
            String vn = v.substring(1, v.length() - 1);
            String value = "";
            if (vn.endsWith("]")) {
                int open = vn.indexOf("[");
                if (open == -1) {
                    continue;
                }
                int index;
                try {
                    index = Integer.parseInt(vn.substring(open + 1, vn.length() - 1));
                } catch (NumberFormatException ignored) {
                    // Not an index expression; leave the placeholder untouched.
                    continue;
                }
                Object ov = context.get(vn.substring(0, open));
                if (ov == null) {
                    continue;
                }
                if (ov instanceof String[]) {
                    String[] ovs = (String[]) ov;
                    if (index < 0 || index >= ovs.length) {
                        continue;
                    }
                    value = ovs[index];
                }
            } else {
                Object ov = context.get(vn);
                if (ov == null) {
                    continue;
                }
                value = ov.toString();
            }
            int pos = var.indexOf(v);
            if (pos < 0) {
                // An earlier substitution already consumed this occurrence.
                continue;
            }
            var = var.substring(0, pos) + value + var.substring(pos + v.length());
        }
        return var;
    }

    /**
     * Fetches {@code url} synchronously and returns the decoded response body.
     *
     * @param url      address to fetch
     * @param maxAge   value for the {@code Cache-Control: public, max-age=} request header
     * @param siteRule supplies candidate user agents and the fallback encoding
     * @return the body text, or {@code ""} when the request fails
     */
    private static String access(String url, int maxAge, SiteRule siteRule) {
        Request.Builder builder = new Request.Builder()
                .url(url)
                .removeHeader("Pragma")
                .header("Cache-Control", "public, max-age=" + maxAge);

        // NOTE(review): the original had a "for(int i=0;i..." loop at this point whose
        // bounds and body were lost from the text this file was restored from. It is
        // omitted rather than guessed — restore it from version control.

        String[] agents = siteRule.getUserAgents();
        String agent;
        if (agents != null && agents.length > 0) {
            // nextInt(bound) excludes bound: the original passed length - 1, which never
            // chose the last agent and threw when exactly one agent was configured.
            agent = agents[RANDOM.nextInt(agents.length)];
        } else {
            agent = HttpMethods.USERAGENT;
        }
        builder.removeHeader("User-Agent").addHeader("User-Agent", agent);

        Response response = null;
        try {
            response = HttpMethods.getOkClient().newCall(builder.build()).execute();
            return enconding(response.body(), siteRule.getEncoding());
        } catch (IOException e) {
            Log.e(TAG, "access: ", e);
            return "";
        } finally {
            if (response != null && response.body() != null) {
                response.body().close();
            }
        }
    }

    /**
     * Reads a response body as text and closes it.
     *
     * <p>When the response declares a charset in its content type the body is decoded by
     * {@code ResponseBody.string()}; otherwise the bytes are decoded with {@code encode},
     * or UTF-8 when {@code encode} is blank or not a usable charset name.
     *
     * @param body   response body; closed before this method returns
     * @param encode charset name to use when the response declares none
     * @return the decoded text, or {@code ""} when {@code body} is {@code null} or
     *         reading fails
     * @throws UnsupportedEncodingException never thrown by this implementation; kept in
     *         the signature so existing callers still compile
     */
    public static String enconding(ResponseBody body, String encode) throws UnsupportedEncodingException {
        if (body == null) {
            return "";
        }
        try {
            // contentType() is null when the server sent no Content-Type header.
            Charset declared = body.contentType() == null ? null : body.contentType().charset();
            if (declared != null) {
                return body.string();
            }
            return new String(body.bytes(), fallbackCharset(encode));
        } catch (IOException e) {
            Log.e(TAG, "enconding: failed to read body", e);
            return "";
        } catch (RuntimeException e) {
            // Kept best-effort like the original, but no longer silent.
            Log.e(TAG, "enconding: failed to decode body", e);
            return "";
        } finally {
            body.close();
        }
    }

    /** Resolves a charset name, falling back to UTF-8 when it is blank or unusable. */
    private static Charset fallbackCharset(String encode) {
        if (!isBlank(encode)) {
            try {
                return Charset.forName(encode);
            } catch (IllegalArgumentException e) {
                // Covers both illegal and unsupported charset names.
                Log.w(TAG, "unusable charset '" + encode + "', using UTF-8", e);
            }
        }
        return Charset.forName("UTF-8");
    }

    /** Returns true when {@code value} is {@code null} or the empty string. */
    private static boolean isBlank(String value) {
        return value == null || "".equals(value);
    }

    /**
     * Resolves a link found on a page against that page's URL.
     *
     * <ul>
     *   <li>{@code //host/path} takes the referer's scheme ({@code https:} only when the
     *       referer starts with it, otherwise {@code http:}).</li>
     *   <li>{@code http://} and {@code https://} links are returned unchanged.</li>
     *   <li>{@code /path} is appended to the referer up to the first {@code /} found at
     *       or after index 8.</li>
     *   <li>Anything else is appended to the referer up to its last {@code /}, provided
     *       that slash is past index 7; otherwise to the whole referer.</li>
     * </ul>
     *
     * @param url     link as written in the page; must not be {@code null}
     * @param referer URL of the page the link came from; may be blank
     * @return the resolved URL; with a blank referer only {@code //} links are changed
     */
    public static String getFullUrl(String url, String referer) {
        boolean protocolRelative = url.startsWith("//");
        if (isBlank(referer)) {
            return protocolRelative ? "http:" + url : url;
        }
        if (protocolRelative) {
            String scheme = referer.toLowerCase().startsWith("https:") ? "https:" : "http:";
            return scheme + url;
        }

        String lowerCaseUrl = url.toLowerCase();
        if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) {
            return url;
        }

        if (url.startsWith("/")) {
            // Searching from index 8 skips the "//" that follows the scheme.
            int index = referer.indexOf("/", 8);
            String host = index > -1 ? referer.substring(0, index) : referer;
            return host + url;
        }

        // A last slash at index 7 or lower belongs to the scheme separator, not a path.
        int index = referer.lastIndexOf("/");
        String prefix = index > 7 ? referer.substring(0, index) : referer;
        return prefix + "/" + url;
    }
}