pda/zhuike/.svn/pristine/88/883e8f7c9b3547f91491cb27cad...

392 lines
15 KiB
Plaintext
Raw Normal View History

2024-02-06 22:23:29 +08:00
package com.novelbook.android.utils;
import android.text.TextUtils;
import android.util.Log;
import com.novelbook.android.db.Chapter;
import com.novelbook.android.db.SiteRule;
import com.novelbook.android.netutils.HttpMethods;
import com.novelbook.android.netutils.NetUtil;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.ResponseBody;
public class NovelParseUtil {
/** Log tag used by this class: the simple name of {@code NovelParseUtil}. */
private static final String TAG=NovelParseUtil.class.getSimpleName();
/**
 * Default regex for matching anchor tags, taken from {@code Constants.A_Regex}.
 * getChaptersMap falls back to it when the site rule supplies no catalogue regex.
 * The trailing comment presumably mirrors the pattern's literal value (TODO confirm against Constants).
 */
private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<";
/**
 * Extracts the chapters of a catalogue page as a flat array of alternating
 * {@code chapterUrl, chapterName} pairs.
 *
 * <p>The map produced by {@code getChaptersMap} is filled from the last matched row
 * to the first, so the pairs are written from the end of the array backwards, which
 * puts them back in page order.
 *
 * @param muluUrl  URL of the catalogue page, used to resolve relative chapter links
 * @param html     HTML of the catalogue page
 * @param siteJson site rule; see {@code getChaptersMap} for the keys that are read
 * @return the url/name pairs, or an empty array when no chapter link was found
 * @throws JSONException if a required key is missing from {@code siteJson}
 */
public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
    Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
    if (muluMap == null) {
        // getChaptersMap signals "nothing matched" with null; answer with an empty
        // array instead of failing on muluMap.size().
        return new String[0];
    }
    String[] values = new String[muluMap.size() * 2];
    int pos = values.length - 2;
    for (Map.Entry<String, String> e : muluMap.entrySet()) {
        values[pos] = e.getKey();
        values[pos + 1] = e.getValue();
        pos -= 2;
    }
    return values;
}
/**
 * Builds {@code Chapter} objects for every chapter link found on a catalogue page.
 *
 * <p>The chapter map is iterated in its own order while the result is filled from the
 * last slot towards the first; each chapter's index is its 1-based position in the
 * returned list.
 *
 * @param domain   value stored on every chapter via {@code setDomain}
 * @param muluUrl  URL of the catalogue page
 * @param html     HTML of the catalogue page
 * @param siteJson site rule handed to {@code getChaptersMap}
 * @return a mutable list of chapters; empty when {@code getChaptersMap} returns null
 * @throws JSONException if {@code getChaptersMap} cannot read the site rule
 */
public static List<Chapter> getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
    Map<String, String> linkToTitle = getChaptersMap(muluUrl, html, siteJson);
    if (linkToTitle == null) {
        return new ArrayList<Chapter>();
    }

    int slot = linkToTitle.size();
    Chapter[] ordered = new Chapter[slot];
    for (Map.Entry<String, String> link : linkToTitle.entrySet()) {
        Chapter item = new Chapter();
        item.setChapterUrl(link.getKey());
        item.setChapterName(link.getValue());
        item.setDomain(domain);
        item.setIndex(slot); // chapter number (1-based)
        slot--;
        ordered[slot] = item;
    }
    return new ArrayList<Chapter>(Arrays.asList(ordered));
}
/**
 * Collects the chapter links of a catalogue page into an insertion-ordered map of
 * absolute chapter URL to chapter name.
 *
 * <p>Rows are matched with {@code siteJson["chapterUrlRegexOnMulu"]}, or with the default
 * anchor regex when that value is empty. Rows are visited from the last match to the
 * first, so the map iterates in reverse page order and, for a URL that appears more
 * than once, the occurrence nearest the end of the page wins. A link is kept only when
 * its resolved URL matches {@code siteJson["chapterUrlPattern"]}.
 *
 * @param muluUrl  URL of the catalogue page, used as referer for relative links
 * @param html     HTML of the catalogue page
 * @param siteJson site rule providing {@code chapterUrlRegexOnMulu} and {@code chapterUrlPattern}
 * @return the url-to-name map, or {@code null} when the row regex matched nothing
 * @throws JSONException if either key is missing from {@code siteJson}
 */
public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
    String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
    String chapterUrlPattern = siteJson.getString("chapterUrlPattern");
    Map<String, String> muluMap = new LinkedHashMap<String, String>();
    String regex = A_Regex;
    if (!isBlank(chapterUrlRegexOnMulu)) {
        regex = chapterUrlRegexOnMulu;
    }
    String[] rows = REUtil.matchs(regex, html);
    if (rows == null || rows.length == 0) return null;
    for (int i = rows.length - 1; i >= 0; i--) {
        String row = rows[i];
        // presumably the capture groups of the row: [0] = href, [1] = link text
        String[] parts = REUtil.groups(regex, row);
        // Both href and name are read below, so a row yielding fewer than two
        // groups (e.g. a custom regex with a single group) cannot be used.
        if (parts == null || parts.length < 2) continue;
        String href = getFullUrl(parts[0], muluUrl);
        if (muluMap.containsKey(href)) continue;
        if (isBlank(REUtil.match(chapterUrlPattern, href))) continue;
        muluMap.put(href, parts[1]);
    }
    return muluMap;
}
/**
 * Extracts the readable text of a chapter from its HTML page.
 *
 * <p>Group 1 of {@code siteJson["chapterContentRegex"]} selects the raw content. The
 * optional {@code siteJson["chapterContentDumpRegex"]} is then removed from it, and the
 * remaining markup is reduced to plain text: paragraph and line-break tags become
 * newlines, entities, scripts and all other tags are dropped, and whitespace-only
 * lines are collapsed.
 *
 * @param html     HTML of the chapter page
 * @param siteJson site rule; {@code chapterContentRegex} is required,
 *                 {@code chapterContentDumpRegex} may be absent or empty
 * @return the cleaned, trimmed text, or {@code ""} when the content regex finds nothing
 * @throws JSONException if {@code chapterContentRegex} is missing
 */
public static String getChapterContent(String html, JSONObject siteJson) throws JSONException {
    String chapterContentRegex = siteJson.getString("chapterContentRegex");
    String text = REUtil.group(chapterContentRegex, html, 1);
    if (isBlank(text)) return "";
    // The dump regex is optional (it is only applied when non-blank), so a missing
    // key must not abort the extraction: optString yields "" instead of throwing.
    String chapterContentDumpRegex = siteJson.optString("chapterContentDumpRegex");
    if (!isBlank(chapterContentDumpRegex)) {
        text = text.replaceAll(chapterContentDumpRegex, "");
    }
    // Declarations / comments of the form <!...>
    text = text.replaceAll("<![^>]+?>", "");
    // p br --> \n
    text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
    text = text.replaceAll("[']+", "");
    // Numeric-style entities such as &#8220;
    text = text.replaceAll("&#[\\w\\d]+;", "");
    text = text.replaceAll("(?i)&nbsp;", " ");
    text = text.replaceAll("[ ]{2}", " ");
    text = text.replaceAll("[ ]{3,}", "  ");
    text = text.replaceAll("<<", "<");
    // Four-letter named entities, with or without their leading '&'
    text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
    // Reluctant quantifier: each script block is removed on its own. The greedy
    // form deleted everything between the first <script and the last </script>,
    // including any chapter text in between.
    text = text.replaceAll("(?i)<script[\\s\\S]+?</Script>", "")
            .replaceAll("<[^>]*>", "");
    text = text.replaceAll("\\n[\\s ]+\\n", "\n");
    // NOTE(review): with plain ASCII spaces this indent is stripped again by trim();
    // presumably a full-width indent was intended -- confirm against the original file.
    if (!text.startsWith(" ")) text = "  " + text;
    return text.trim();
}
/**
 * Converts a flat array of alternating {@code chapterUrl, chapterName} entries into
 * {@code Chapter} objects, numbering them from 1 in array order.
 *
 * @param rows   alternating url/name entries; may be {@code null}. A trailing url
 *               without a matching name is ignored.
 * @param domain value stored on every chapter via {@code setDomain}
 * @return a mutable list of chapters, empty when {@code rows} is null or empty
 */
public static List<Chapter> getChaptersLst(String[] rows,String domain){
    List<Chapter> lst = new ArrayList<Chapter>();
    if (rows == null) {
        // The array-returning getChapters overload can yield null when a site has no rule.
        return lst;
    }
    int chapterNo = 0;
    // "i + 1 < length" keeps rows[i + 1] in bounds for an odd-length array.
    for (int i = 0; i + 1 < rows.length; i += 2) {
        chapterNo++;
        Chapter chapter = new Chapter();
        chapter.setChapterUrl(rows[i]);
        chapter.setChapterName(rows[i + 1]);
        chapter.setDomain(domain);
        chapter.setIndex(chapterNo); // chapter number (1-based)
        lst.add(chapter);
    }
    return lst;
}
/**
 * Runs the rule-driven chapter extraction for {@code url} and wraps the resulting
 * url/name pairs in {@code Chapter} objects.
 *
 * @param url      start URL made available to the rule templates
 * @param siteJson site rule containing the extraction rules
 * @param siteName value stored as the domain of every chapter
 * @param maxAage  max-age (seconds) for the Cache-Control header of pages fetched on the way
 * @param siteRule per-site request settings used when fetching pages
 * @return the chapters in extraction order; empty when the extraction yields nothing
 * @throws JSONException if the rule JSON is malformed
 */
public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException {
    String[] rows = getChapters(url, siteJson, maxAage, siteRule);
    if (rows == null) {
        // Guard against a null result from the array overload, which would
        // otherwise be dereferenced by getChaptersLst.
        return new ArrayList<Chapter>();
    }
    return getChaptersLst(rows, siteName);
}
/**
 * Evaluates the rule list stored under {@code siteJson["chapterUrlRegexOnMulu"]} and
 * returns every string produced by the rules' {@code output} templates.
 *
 * <p>Rule format as read by this method:
 * <ul>
 *   <li>the key holds an array of rule objects;</li>
 *   <li>each rule has a {@code source} template and a {@code regexs} array. When the
 *       expanded source starts with {@code "html:"}, the remainder is fetched over HTTP
 *       and the response text becomes the source;</li>
 *   <li>each {@code regexs} entry has {@code group} (boolean: capture groups vs. all
 *       matches), {@code regex}, {@code name} (context key for the result) and an
 *       optional {@code child};</li>
 *   <li>a {@code child} has the same three fields and is applied to every value of its
 *       parent; it may carry an {@code output} array of templates whose expansions are
 *       appended to the result.</li>
 * </ul>
 * Templates may reference the context entry {@code url} and any stored {@code name}.
 *
 * @param url      start URL, stored in the template context as {@code url}
 * @param siteJson site rule
 * @param maxAge   max-age (seconds) for the Cache-Control header of fetched pages
 * @param siteRule per-site request settings used when fetching pages
 * @return the expanded outputs in evaluation order; an empty array when the key is
 *         absent or the rule list is empty
 * @throws JSONException if the key is not an array or a rule lacks a required field
 */
public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException {
    // Check for the key before reading it: reading first made this guard unreachable
    // and threw for sites without the rule.
    if (!siteJson.has("chapterUrlRegexOnMulu")) {
        return new String[0];
    }
    JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
    if (muluArray == null || muluArray.length() == 0) {
        // Empty rather than null: the list-building overload dereferences the result.
        return new String[0];
    }

    Map<String, Object> context = new HashMap<String, Object>();
    context.put("url", url);
    List<String> result = new ArrayList<String>();

    // Outermost rule objects
    for (int i = 0; i < muluArray.length(); i++) {
        JSONObject regexsJson = muluArray.getJSONObject(i);
        String source = getContent(regexsJson.getString("source"), context);
        if (source.startsWith("html:")) {
            source = access(source.substring("html:".length()), maxAge, siteRule);
        }

        // First-level regex objects
        JSONArray regexsArray = regexsJson.getJSONArray("regexs");
        for (int j = 0; j < regexsArray.length(); j++) {
            JSONObject regexJson = regexsArray.getJSONObject(j);
            String[] values = applyRegexRule(regexJson, source);
            if (values == null) {
                // No match: nothing to store and no child to run.
                continue;
            }
            context.put(regexJson.getString("name"), values);
            if (!regexJson.has("child")) {
                continue;
            }

            // The child is generally what produces the returned results
            JSONObject childJson = regexJson.getJSONObject("child");
            for (String value : values) {
                String[] values2 = applyRegexRule(childJson, value);
                if (values2 == null) {
                    continue;
                }
                context.put(childJson.getString("name"), values2);
                if (!childJson.has("output")) {
                    continue;
                }
                JSONArray outputArray = childJson.getJSONArray("output");
                for (int m = 0; m < outputArray.length(); m++) {
                    String v = getContent(outputArray.getString(m), context);
                    if (v != null) result.add(v);
                }
            }
        }
    }
    return result.toArray(new String[0]);
}

/** Applies one rule's regex to {@code input}: capture groups when {@code group} is true, all matches otherwise. */
private static String[] applyRegexRule(JSONObject rule, String input) throws JSONException {
    if (rule.getBoolean("group")) {
        return REUtil.groups(rule.getString("regex"), input);
    }
    return REUtil.matchs(rule.getString("regex"), input);
}
/**
 * Expands {@code {name}} and {@code {name[index]}} placeholders in a template using
 * values from {@code context}.
 *
 * <p>{@code {name}} is replaced by {@code context.get(name).toString()}.
 * {@code {name[index]}} is replaced by element {@code index} of the {@code String[]}
 * stored under {@code name}; if the stored value is not a {@code String[]} the
 * placeholder is replaced by an empty string. A placeholder is left untouched when its
 * name is unknown, its subscript is not a number, or the index is out of range.
 *
 * @param var     template text; may be {@code null}
 * @param context values available to the template
 * @return the expanded text, or {@code null} when {@code var} is {@code null}
 */
private static String getContent(String var, Map<String, Object> context) {
    if (var == null) return null;
    String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
    if (vs == null) return var; // no placeholders found
    for (String v : vs) {
        String vn = v.substring(1, v.length() - 1); // placeholder without the braces
        String value = "";
        if (vn.endsWith("]")) {
            int open = vn.indexOf("[");
            if (open == -1) continue;
            int index;
            try {
                index = Integer.parseInt(vn.substring(open + 1, vn.length() - 1));
            } catch (NumberFormatException ignored) {
                // Subscript is not a number, so this is not an array placeholder.
                continue;
            }
            Object ov = context.get(vn.substring(0, open));
            if (ov == null) continue;
            if (ov instanceof String[]) {
                String[] ovs = (String[]) ov;
                if (index < 0 || index >= ovs.length) continue;
                value = ovs[index];
            }
        } else {
            Object ov = context.get(vn);
            if (ov == null) continue;
            value = ov.toString();
        }
        if (value == null) {
            // A null array element would otherwise be concatenated as the text "null".
            value = "";
        }
        int pos = var.indexOf(v);
        if (pos < 0) continue; // placeholder no longer present in the text
        var = var.substring(0, pos) + value + var.substring(pos + v.length());
    }
    return var;
}
/**
 * Fetches {@code url} with a synchronous GET and returns the response text.
 *
 * <p>The request carries {@code Cache-Control: public, max-age=<maxAge>}, the extra
 * headers of {@code siteRule} (a flat name/value array) and a User-Agent chosen at
 * random from {@code siteRule.getUserAgents()}, or {@code HttpMethods.USERAGENT} when
 * the rule has none. The body is decoded by {@code enconding} using the rule's
 * encoding as fallback.
 *
 * @param url      absolute http or https URL
 * @param maxAge   value for the max-age cache directive
 * @param siteRule per-site request settings
 * @return the response text, or {@code ""} when the URL is not http(s) or the request fails
 */
private static String access(String url,int maxAge, SiteRule siteRule) {
    if (TextUtils.isEmpty(url)) {
        return "";
    }
    // Accept http as well as https. The old test
    // (!startsWith("http") || !startsWith("https")) rejected every plain-http URL.
    boolean isHttp = url.regionMatches(true, 0, "http://", 0, 7);
    boolean isHttps = url.regionMatches(true, 0, "https://", 0, 8);
    if (!isHttp && !isHttps) {
        return "";
    }

    Request.Builder builder = new Request.Builder()
            .url(url)
            .removeHeader("Pragma")
            .header("Cache-Control", "public, max-age=" + maxAge);

    String[] headers = siteRule.getHeaders();
    if (headers != null) {
        // Name/value pairs; a dangling name without a value is skipped.
        for (int i = 0; i + 1 < headers.length; i += 2) {
            builder.header(headers[i], headers[i + 1]);
        }
    }

    String userAgent = HttpMethods.USERAGENT;
    String[] userAgents = siteRule.getUserAgents();
    if (userAgents != null && userAgents.length > 0) {
        // Random agent. nextInt(bound) is exclusive of bound, so the bound is the full
        // length: length - 1 never picked the last agent and threw for a single entry.
        userAgent = userAgents[new Random().nextInt(userAgents.length)];
    }
    builder.removeHeader("User-Agent").addHeader("User-Agent", userAgent);

    Request request = builder.build();
    Response response = null;
    try {
        response = HttpMethods.getOkClient().newCall(request).execute();
        return enconding(response.body(), siteRule.getEncoding());
    } catch (IOException e) {
        Log.e(TAG, "access: ", e);
    } finally {
        if (response != null && response.body() != null) {
            response.body().close();
        }
    }
    return "";
}
/**
 * Reads a response body as text and closes it.
 *
 * <p>When the Content-Type header declares a charset, the body is decoded by
 * {@code body.string()}. Otherwise the bytes are decoded with {@code encode}; if
 * {@code encode} is empty too, {@code body.string()} is used with its default charset.
 * Decoding is best effort: any failure is logged and {@code ""} is returned.
 *
 * @param body   response body; may be {@code null}
 * @param encode fallback charset name for responses that do not declare one
 * @return the decoded text, or {@code ""} on a null body or a read/decoding failure
 * @throws UnsupportedEncodingException kept in the signature for existing callers;
 *         failures are handled internally
 */
public static String enconding(ResponseBody body, String encode) throws UnsupportedEncodingException {
    if (body == null) {
        return "";
    }
    String s = "";
    try {
        // contentType() is null when the response has no Content-Type header; that
        // used to raise a swallowed NPE and silently discard the whole body.
        Charset charset = body.contentType() == null ? null : body.contentType().charset();
        if (charset != null || isBlank(encode)) {
            s = body.string();
        } else {
            s = new String(body.bytes(), encode);
        }
    } catch (Exception er) {
        // Still best effort, but no longer silent.
        Log.e(TAG, "enconding: ", er);
    } finally {
        body.close();
    }
    return s;
}
/**
 * Tells whether a string carries no characters at all.
 *
 * @param value string to test; may be {@code null}
 * @return {@code true} for {@code null} and for the empty string; whitespace-only
 *         strings are not considered blank
 */
private static boolean isBlank(String value) {
    if (value == null) {
        return true;
    }
    return value.length() == 0;
}
/**
 * Resolves a link found on a page against the URL of that page.
 *
 * <ul>
 *   <li>Without a referer: {@code //host/...} gets an {@code http:} prefix, anything
 *       else is returned unchanged.</li>
 *   <li>{@code //host/...}: prefixed with the referer's scheme ({@code https:} or {@code http:}).</li>
 *   <li>{@code http://} or {@code https://} (any letter case): returned unchanged.</li>
 *   <li>{@code /path}: appended to the referer cut at the first {@code /} found from index 8.</li>
 *   <li>otherwise: appended, after a {@code /}, to the referer cut at its last {@code /}
 *       (when that slash lies beyond index 7).</li>
 * </ul>
 *
 * @param url     link to resolve; must not be {@code null}
 * @param referer URL of the page the link was found on; may be {@code null} or empty
 * @return the resolved URL
 */
public static String getFullUrl(String url, String referer) {
    final boolean schemeRelative = url.startsWith("//");

    if (isBlank(referer)) {
        return schemeRelative ? "http:" + url : url;
    }
    if (schemeRelative) {
        String scheme = referer.toLowerCase().startsWith("https:") ? "https:" : "http:";
        return scheme + url;
    }

    String probe = url.toLowerCase();
    boolean alreadyAbsolute = probe.startsWith("http://") || probe.startsWith("https://");
    if (alreadyAbsolute) {
        return url;
    }

    if (url.startsWith("/")) {
        // Host-relative: keep only scheme + host of the referer.
        int pathStart = referer.indexOf("/", 8);
        String origin = pathStart > -1 ? referer.substring(0, pathStart) : referer;
        return origin + url;
    }

    // Document-relative: replace the last path segment of the referer.
    int lastSlash = referer.lastIndexOf("/");
    String base = lastSlash > 7 ? referer.substring(0, lastSlash) : referer;
    return base + "/" + url;
}
}