392 lines
15 KiB
Plaintext
392 lines
15 KiB
Plaintext
package com.novelbook.android.utils;
|
||
|
||
|
||
import android.text.TextUtils;
|
||
import android.util.Log;
|
||
|
||
import com.novelbook.android.db.Chapter;
|
||
import com.novelbook.android.db.SiteRule;
|
||
import com.novelbook.android.netutils.HttpMethods;
|
||
import com.novelbook.android.netutils.NetUtil;
|
||
|
||
import org.json.JSONArray;
|
||
import org.json.JSONException;
|
||
import org.json.JSONObject;
|
||
|
||
import java.io.IOException;
|
||
|
||
import java.io.UnsupportedEncodingException;
|
||
import java.nio.charset.Charset;
|
||
import java.util.ArrayList;
|
||
import java.util.Arrays;
|
||
import java.util.Date;
|
||
import java.util.HashMap;
|
||
import java.util.LinkedHashMap;
|
||
import java.util.List;
|
||
import java.util.Map;
|
||
import java.util.Random;
|
||
import java.util.Set;
|
||
|
||
import okhttp3.Request;
|
||
import okhttp3.Response;
|
||
import okhttp3.ResponseBody;
|
||
|
||
public class NovelParseUtil {
|
||
/** Log tag for this class: the simple class name. */
private static final String TAG=NovelParseUtil.class.getSimpleName();
|
||
/**
 * Default link-matching regex, taken from {@code Constants.A_Regex}. It is the fallback used
 * by {@code getChaptersMap} when the site rule does not supply its own chapter-link regex.
 * NOTE(review): the trailing comment presumably shows the constant's value - confirm against Constants.
 */
private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<";
|
||
public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
|
||
|
||
Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
|
||
|
||
|
||
String[] values = new String[muluMap.size() * 2];
|
||
|
||
Set<Map.Entry<String, String>> es = muluMap.entrySet();
|
||
int pos = values.length - 2;
|
||
for (Map.Entry<String, String> e : es) {
|
||
values[pos] = e.getKey();
|
||
values[pos + 1] = e.getValue();
|
||
pos -= 2;
|
||
}
|
||
return values;
|
||
}
|
||
public static List<Chapter> getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
|
||
|
||
Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
|
||
if(muluMap==null){
|
||
return new ArrayList<Chapter>();
|
||
}
|
||
Chapter[] tmp = new Chapter[muluMap.size()];
|
||
|
||
|
||
Set<Map.Entry<String, String>> es = muluMap.entrySet();
|
||
int pos = tmp.length - 1;
|
||
for (Map.Entry<String, String> e : es) {
|
||
|
||
Chapter chapter = new Chapter();
|
||
chapter.setChapterUrl( e.getKey());
|
||
chapter.setChapterName( e.getValue());
|
||
chapter.setDomain(domain);
|
||
chapter.setIndex(pos+1); //第几章
|
||
|
||
tmp[pos--] =chapter;
|
||
}
|
||
List<Chapter> values = new ArrayList<Chapter>(Arrays.asList(tmp));
|
||
return values;
|
||
}
|
||
public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
|
||
String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
|
||
String chapterUrlPattern = siteJson.getString("chapterUrlPattern");
|
||
|
||
//Log.i(TAG, "getChaptersMap: chapterUrlRegexOnMulu: " + chapterUrlRegexOnMulu);
|
||
Map<String, String> muluMap = new LinkedHashMap<String, String>();
|
||
String regex = A_Regex;
|
||
if (!isBlank(chapterUrlRegexOnMulu)) {
|
||
regex = chapterUrlRegexOnMulu;
|
||
}
|
||
//Log.i(TAG, "getChaptersMap: regex: " + regex);
|
||
String[] rows = REUtil.matchs(regex, html);;
|
||
if (rows == null || rows.length == 0) return null;
|
||
for (int i = rows.length - 1; i >= 0; i--) {
|
||
String row = rows[i];
|
||
String[] parts = REUtil.groups(regex, row);
|
||
if (parts == null || parts.length == 0) continue;
|
||
String href = getFullUrl(parts[0], muluUrl);
|
||
if (muluMap.containsKey(href)) continue;
|
||
if (isBlank(REUtil.match(chapterUrlPattern, href))) continue;
|
||
String name = parts[1];
|
||
muluMap.put(href, name);
|
||
}
|
||
|
||
return muluMap;
|
||
|
||
}
|
||
|
||
|
||
|
||
|
||
public static String getChapterContent(String html, JSONObject siteJson) throws JSONException {
|
||
String chapterContentRegex = siteJson.getString("chapterContentRegex");
|
||
String text = REUtil.group(chapterContentRegex, html, 1);
|
||
if (isBlank(text)) return "";
|
||
|
||
String chapterContentDumpRegex = siteJson.getString("chapterContentDumpRegex");
|
||
if (!isBlank(chapterContentDumpRegex)) {
|
||
text = text.replaceAll(chapterContentDumpRegex, "");
|
||
}
|
||
text = text.replaceAll("<![^>]+?>", "");
|
||
// p br --> \n
|
||
text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
|
||
text = text.replaceAll("[']+", "‘");
|
||
text = text.replaceAll("&#[\\w\\d]+;", "");
|
||
|
||
|
||
text = text.replaceAll("(?i) ", " ");
|
||
text = text.replaceAll("[ ]{2}", " ");
|
||
text = text.replaceAll("[ ]{3,}", " ");
|
||
|
||
text = text.replaceAll("<<", "<");
|
||
text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
|
||
text = text.replaceAll("(?i)<script[\\s\\S]+</Script>", "")
|
||
.replaceAll("<[^>]*>", "");
|
||
|
||
text = text.replaceAll("\\n[\\s ]+\\n", "\n");
|
||
if (!text.startsWith(" ")) text = " " + text;
|
||
|
||
return text.trim();
|
||
}
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
public static List<Chapter> getChaptersLst(String[] rows,String domain){
|
||
|
||
ArrayList<Chapter> lst = new ArrayList<Chapter>();
|
||
int j=0;
|
||
for (int i=0;i<rows.length;i+=2) {
|
||
j++;
|
||
Chapter chapter = new Chapter();
|
||
chapter.setChapterUrl( rows[i]);
|
||
chapter.setChapterName( rows[i+1]);
|
||
chapter.setDomain(domain);
|
||
chapter.setIndex(j); //第几章
|
||
lst.add(chapter);
|
||
|
||
}
|
||
return lst;
|
||
}
|
||
|
||
public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException {
|
||
return getChaptersLst(getChapters(url,siteJson,maxAage,siteRule),siteName);
|
||
}
|
||
|
||
|
||
public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException {
|
||
//if (!siteJson.keys().("chapterUrlRegexOnMulu")) return null;
|
||
String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
|
||
// if(TextUtils.isEmpty(chapterUrlRegexOnMulu)) return null;
|
||
if(!siteJson.has("chapterUrlRegexOnMulu")){
|
||
return null ;
|
||
}
|
||
JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
|
||
if (muluArray == null || muluArray.length()== 0) return null;
|
||
|
||
//Log.i(TAG, "to get chaps muluArray is null: " +( muluArray ==null) );
|
||
|
||
Map<String, Object> context = new HashMap<String, Object>();
|
||
context.put("url", url);
|
||
//Log.i(TAG, "to get chaps url:" + url );
|
||
List<String> result = new ArrayList<String>();
|
||
// 最外部的大的规则对象
|
||
for (int i = 0; i < muluArray.length(); i++) {
|
||
|
||
JSONObject regexsJson = muluArray.getJSONObject(i);
|
||
String source = regexsJson.getString("source");
|
||
source = getContent(source, context);
|
||
//Log.i(TAG, "to get chaps source:" + source );
|
||
if (source.startsWith("html:")) {
|
||
String _url = source.substring("html:".length());
|
||
source = access(_url,maxAge,siteRule );
|
||
//Log.i(TAG, "to get chaps source:" + source );
|
||
}
|
||
|
||
// 第一次Regex对象
|
||
JSONArray regexsArray = regexsJson.getJSONArray("regexs");
|
||
//Log.i(TAG, "to get chaps regexsArray.length():" + regexsArray.length() );
|
||
for (int j = 0; j < regexsArray.length(); j++) {
|
||
JSONObject regexJson = regexsArray.getJSONObject(j);
|
||
String[] values = null;
|
||
//Log.i(TAG, "to get chaps regexJson.getBoolean(\"group\"):" + regexJson.getBoolean("group") );
|
||
if (regexJson.getBoolean("group")) {
|
||
values = REUtil.groups(regexJson.getString("regex"), source);
|
||
} else {
|
||
values = REUtil.matchs(regexJson.getString("regex"), source);
|
||
}
|
||
//Log.i(TAG, "to get chaps values==null? :" + (values==null) );
|
||
for(String s:values){
|
||
//Log.i(TAG, "to get chaps value :" + s);
|
||
}
|
||
if (values != null) context.put(regexJson.getString("name"), values);
|
||
|
||
|
||
// String child = siteJson.getString("child");
|
||
//Log.i(TAG, "to get chaps siteJson.has(\"child\")? :" + siteJson.has("child"));
|
||
if ( regexJson.has("child") ) {
|
||
// 一般用来做返回结果用的
|
||
JSONObject childJson = regexJson.getJSONObject("child");
|
||
//Log.i(TAG, "to get chaps childJson :" + childJson.toString());
|
||
for (String value : values) {
|
||
//Log.i(TAG, "to get chaps value in values :" + value);
|
||
String[] values2 = null;
|
||
//Log.i(TAG, "to get chaps childJson.getBoolean(\"group\") :" + childJson.getBoolean("group"));
|
||
if (childJson.getBoolean("group")) {
|
||
values2 = REUtil.groups(childJson.getString("regex"), value);
|
||
} else {
|
||
values2 = REUtil.matchs(childJson.getString("regex"), value);
|
||
}
|
||
//Log.i(TAG, "to get chaps values2 != null ? :" + (values2 != null));
|
||
if (values2 != null) {
|
||
//Log.i(TAG, "to get chaps value in values2 :" + value);
|
||
context.put(childJson.getString("name"), values2);
|
||
//Log.i(TAG, "to get chaps siteJson.has(\"output\") :" + siteJson.has("output"));
|
||
if (childJson.has("output")) {
|
||
JSONArray outputArray = childJson.getJSONArray("output");
|
||
for (int m = 0; m < outputArray.length(); m++) {
|
||
String v = getContent(outputArray.getString(m), context);
|
||
//Log.i(TAG, "to get chaps v :" + v);
|
||
if (v != null) result.add(v);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
//Log.i(TAG, "to get chaps <---------------------------- result size :" + result.size());
|
||
return result.toArray(new String[0]);
|
||
}
|
||
|
||
private static String getContent(String var, Map<String, Object> context) {
|
||
String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
|
||
for (String v : vs) {
|
||
String vn = v.substring(1, v.length() - 1);
|
||
String value = "";
|
||
if (vn.endsWith("]")) {
|
||
int pos = vn.indexOf("[");
|
||
if (pos == -1) continue;
|
||
int index =Integer.parseInt(vn.substring(pos + 1, vn.length() - 1));
|
||
Object ov = context.get(vn.substring(0, pos));
|
||
if (ov == null) continue;
|
||
if (ov.getClass().isArray()) {
|
||
String[] ovs = (String[])ov;
|
||
if (index >= ovs.length) continue;
|
||
value = ovs[index];
|
||
}
|
||
} else {
|
||
if (context.get(vn) == null) continue;
|
||
value = context.get(vn).toString();
|
||
}
|
||
|
||
int pos = var.indexOf(v);
|
||
var = var.substring(0, pos) + value + var.substring(pos + vn.length() + 2);
|
||
}
|
||
|
||
return var;
|
||
}
|
||
|
||
|
||
private static String access(String url,int maxAge, SiteRule siteRule) {
|
||
if(TextUtils.isEmpty(url) || !url.toLowerCase().startsWith("http") || !url.toLowerCase().startsWith("https")){
|
||
return "";
|
||
}
|
||
Request.Builder builder = new Request.Builder()
|
||
// .tag(mNovel.getNovelId()) //标记 请求的tag,切换小说或离开小说界面(BookActivity) 时 取消未执行完毕的 此tag的所有请求
|
||
.url(url)
|
||
.removeHeader("Pragma")
|
||
.header("Cache-Control", "public, max-age=" + maxAge)
|
||
// .header("Accept-Encoding","gzip, deflate, sdch")
|
||
// .header("Accept-Language","zh-CN,zh;q=0.9")
|
||
// .header( "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
|
||
// .header( "Upgrade-Insecure-Requests", "1")
|
||
// .header("content-type", "text/html; charset=utf-8")
|
||
;
|
||
for(int i=0;i<siteRule.getHeaders().length;i+=2){
|
||
builder.header(siteRule.getHeaders()[i],siteRule.getHeaders()[i+1]);
|
||
|
||
}
|
||
if(siteRule.getUserAgents()!=null && siteRule.getUserAgents().length>0){
|
||
|
||
builder.removeHeader("User-Agent").addHeader("User-Agent", siteRule.getUserAgents()[new Random().nextInt( siteRule.getUserAgents().length-1)]); //加 随机agent
|
||
|
||
}else{
|
||
builder.removeHeader("User-Agent").addHeader("User-Agent", HttpMethods.USERAGENT);
|
||
}
|
||
Request request =builder.build() ;
|
||
Response response = null;
|
||
try {
|
||
response = HttpMethods.getOkClient().newCall(request).execute();
|
||
//String s =response.body().string();
|
||
|
||
String s = enconding(response.body(),siteRule.getEncoding()); //new String(response.body().bytes(), encoding);
|
||
// response.body().close();
|
||
long st = new java.util.Date().getTime();
|
||
//Log.i(TAG, "to get chaps access result:" + s );
|
||
return s;
|
||
// return enconding(s,encoding);
|
||
// return info;
|
||
} catch (IOException e) {
|
||
e.printStackTrace();
|
||
Log.e(TAG, "access: ", e);
|
||
}finally {
|
||
if(response!=null)
|
||
response.body().close();
|
||
}
|
||
return "";
|
||
|
||
}
|
||
|
||
public static String enconding(ResponseBody body, String encode) throws UnsupportedEncodingException {
|
||
String s="";
|
||
try{
|
||
Charset charset = body.contentType().charset();
|
||
if(charset!=null){
|
||
s= body.string();
|
||
}else {
|
||
s= new String(body.bytes(), encode);
|
||
}
|
||
}catch (Exception er){
|
||
|
||
}finally {
|
||
body.close();
|
||
}
|
||
return s;
|
||
|
||
}
|
||
|
||
|
||
|
||
private static boolean isBlank(String value) {
|
||
return value == null || "".equals(value);
|
||
}
|
||
|
||
public static String getFullUrl(String url, String referer) {
|
||
if ( isBlank(referer) ) {
|
||
if (url.startsWith("//")) {
|
||
return "http:" + url;
|
||
} else {
|
||
return url;
|
||
}
|
||
} else if (url.startsWith("//")) {
|
||
if (referer.toLowerCase().startsWith("https:")) {
|
||
return "https:" + url;
|
||
} else {
|
||
return "http:" + url;
|
||
}
|
||
}
|
||
|
||
String lowerCaseUrl = url.toLowerCase();
|
||
if (lowerCaseUrl.startsWith("http://") || lowerCaseUrl.startsWith("https://")) {
|
||
return url;
|
||
}
|
||
if (url.startsWith("/")) {
|
||
int index = referer.indexOf("/", 8);
|
||
String host = referer;
|
||
if (index > -1) {
|
||
host = referer.substring(0, index);
|
||
}
|
||
return host + url;
|
||
} else {
|
||
int index = referer.lastIndexOf("/");
|
||
String prefix = referer;
|
||
if (index > 7) {
|
||
prefix = referer.substring(0, index);
|
||
}
|
||
return prefix + "/" + url;
|
||
}
|
||
}
|
||
}
|