pda/zhuike/.svn/pristine/70/70457a8fdbdff2f8674621ae811...

388 lines
15 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.novelbook.android.utils;
import android.util.Log;
import com.novelbook.android.db.Chapter;
import com.novelbook.android.db.SiteRule;
import com.novelbook.android.netutils.HttpMethods;
import com.novelbook.android.netutils.NetUtil;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import okhttp3.Request;
import okhttp3.Response;
import okhttp3.ResponseBody;
/**
 * Static helpers that extract chapter lists and chapter text from downloaded novel-site HTML,
 * driven by per-site rules passed in as JSON, and that resolve chapter links to full URLs.
 */
public class NovelParseUtil {
// Tag used for android.util.Log output from this class.
private static final String TAG=NovelParseUtil.class.getSimpleName();
// Fallback link-matching regex used by getChaptersMap when the site rule's
// "chapterUrlRegexOnMulu" is empty. The trailing comment presumably records the value
// held by Constants.A_Regex -- verify against Constants.
private static final String A_Regex =Constants.A_Regex ;//"<a[^>]+href[\\s]*=[\\s]*['\"]?([^'\"]+)['\"\\s]?[^>]*>([^<]+)<";
/**
 * Flattens the chapter links of a table-of-contents page into one array of alternating
 * URL and title entries: {@code [url, title, url, title, ...]}.
 *
 * <p>Pairs are written from the end of the array towards the start, i.e. in the reverse of the
 * iteration order of the map returned by {@code getChaptersMap}.
 *
 * @param muluUrl  URL of the table-of-contents page, used to resolve relative links
 * @param html     HTML of the table-of-contents page
 * @param siteJson site rule; see {@code getChaptersMap} for the keys that are read
 * @return the flattened pairs; an empty array when no chapter link was found
 * @throws JSONException if a required rule key is missing from {@code siteJson}
 */
public static String[] getChaptersArray(String muluUrl, String html, JSONObject siteJson) throws JSONException {
    Map<String, String> muluMap = getChaptersMap(muluUrl, html, siteJson);
    // getChaptersMap signals "nothing matched" with null; do not dereference it.
    if (muluMap == null) {
        return new String[0];
    }
    String[] values = new String[muluMap.size() * 2];
    int pos = values.length - 2;
    for (Map.Entry<String, String> e : muluMap.entrySet()) {
        values[pos] = e.getKey();
        values[pos + 1] = e.getValue();
        pos -= 2;
    }
    return values;
}
/**
 * Builds {@link Chapter} objects for every chapter link found on a table-of-contents page.
 *
 * <p>The links come from {@code getChaptersMap}; the returned list is in the reverse of that
 * map's iteration order, and each chapter's index equals its 1-based position in the list.
 *
 * @param domain   value stored on every chapter via {@code setDomain}
 * @param muluUrl  URL of the table-of-contents page
 * @param html     HTML of the table-of-contents page
 * @param siteJson site rule handed to {@code getChaptersMap}
 * @return a mutable list of chapters; empty when no link was found
 * @throws JSONException if a required rule key is missing from {@code siteJson}
 */
public static List<Chapter> getChapters(String domain,String muluUrl, String html, JSONObject siteJson) throws JSONException {
    Map<String, String> links = getChaptersMap(muluUrl, html, siteJson);
    if (links == null) {
        return new ArrayList<Chapter>();
    }

    int total = links.size();
    Chapter[] ordered = new Chapter[total];
    int slot = total;
    for (Map.Entry<String, String> link : links.entrySet()) {
        slot--;
        Chapter item = new Chapter();
        item.setChapterUrl(link.getKey());
        item.setChapterName(link.getValue());
        item.setDomain(domain);
        item.setIndex(slot + 1); // chapter number, 1-based
        ordered[slot] = item;
    }
    return new ArrayList<Chapter>(Arrays.asList(ordered));
}
/**
 * Collects the chapter links of a table-of-contents page as an ordered URL-to-title map.
 *
 * <p>Rule keys read from {@code siteJson}:
 * <ul>
 *   <li>{@code chapterUrlRegexOnMulu} - regex matching one chapter link; when empty the
 *       class-level default {@code A_Regex} is used. Its first two extracted groups are taken
 *       as href and title.</li>
 *   <li>{@code chapterUrlPattern} - a resolved URL is kept only if it matches this pattern.</li>
 * </ul>
 *
 * <p>Matches are processed from the last one on the page to the first, so the map iterates in
 * reverse page order and, for a URL that occurs more than once, the last occurrence wins.
 *
 * @param muluUrl  URL of the table-of-contents page, used to resolve relative hrefs
 * @param html     HTML of the table-of-contents page
 * @param siteJson site rule
 * @return the ordered map, or {@code null} when the link regex matched nothing
 * @throws JSONException if either rule key is missing
 */
public static Map<String, String> getChaptersMap(String muluUrl, String html, JSONObject siteJson) throws JSONException {
    String chapterUrlRegexOnMulu = siteJson.getString("chapterUrlRegexOnMulu");
    String chapterUrlPattern = siteJson.getString("chapterUrlPattern");
    Map<String, String> muluMap = new LinkedHashMap<String, String>();
    String regex = A_Regex;
    if (!isBlank(chapterUrlRegexOnMulu)) {
        regex = chapterUrlRegexOnMulu;
    }
    String[] rows = REUtil.matchs(regex, html);
    if (rows == null || rows.length == 0) return null;
    for (int i = rows.length - 1; i >= 0; i--) {
        String[] parts = REUtil.groups(regex, rows[i]);
        // Both href (parts[0]) and title (parts[1]) are needed; a rule regex with fewer
        // than two groups would otherwise overrun the array below.
        if (parts == null || parts.length < 2) continue;
        String href = getFullUrl(parts[0], muluUrl);
        if (muluMap.containsKey(href)) continue;
        if (isBlank(REUtil.match(chapterUrlPattern, href))) continue;
        muluMap.put(href, parts[1]);
    }
    return muluMap;
}
/**
 * Extracts the readable text of one chapter from a chapter page.
 *
 * <p>The text is taken from {@code html} with the rule regex "chapterContentRegex"
 * (via {@code REUtil.group(regex, html, 1)} -- presumably capture group 1), optionally stripped
 * with the rule regex "chapterContentDumpRegex", and then passed through a fixed,
 * order-dependent series of clean-up replacements that turn paragraph/line-break tags into
 * newlines and remove entities, scripts and remaining tags.
 *
 * @param html     HTML of the chapter page
 * @param siteJson site rule; both "chapterContentRegex" and "chapterContentDumpRegex" are read
 *                 with getString
 * @return the cleaned text, or "" when the content regex yields nothing
 * @throws JSONException propagated from the {@code siteJson.getString} lookups
 */
public static String getChapterContent(String html, JSONObject siteJson) throws JSONException {
String chapterContentRegex = siteJson.getString("chapterContentRegex");
String text = REUtil.group(chapterContentRegex, html, 1);
if (isBlank(text)) return "";
// Site-specific junk (ads, watermarks, ...) is removed first when the rule provides a regex.
String chapterContentDumpRegex = siteJson.getString("chapterContentDumpRegex");
if (!isBlank(chapterContentDumpRegex)) {
text = text.replaceAll(chapterContentDumpRegex, "");
}
// Drop "<!...>" constructs such as comments and declarations.
text = text.replaceAll("<![^>]+?>", "");
// <p>, </p> and <br> variants become newlines.
text = text.replaceAll("(?i)<[/]?[\\s]*p[^>]*>|<[/]*br[/ ]*>", "\n");
// Remove quote characters matched by the class below.
text = text.replaceAll("[']+", "");
// Remove "&#...;" character references.
text = text.replaceAll("&#[\\w\\d]+;", "");
// NOTE(review): the whitespace characters inside the next three replacements (and in the
// two statements before the return) may not be plain ASCII spaces in the repository copy;
// confirm the exact characters before editing these literals.
text = text.replaceAll("(?i)&nbsp;", " ");
text = text.replaceAll("[ ]{2}", " ");
text = text.replaceAll("[ ]{3,}", "  ");
text = text.replaceAll("<<", "<");
// Removes "&xxxx;" entities with a four-character name, then ANY four word characters
// followed by ';' -- NOTE(review): the second pattern also hits ordinary text such as "word;".
text = text.replaceAll("&[\\w\\d]{4};", "").replaceAll("[\\w\\d]{4};", "");
// Strip script blocks, then every remaining tag.
// NOTE(review): "[\\s\\S]+" is greedy, so everything between the first <script and the
// last </script> is removed, including text between separate script blocks.
text = text.replaceAll("(?i)<script[\\s\\S]+</Script>", "")
.replaceAll("<[^>]*>", "");
// Collapse lines that contain only whitespace.
text = text.replaceAll("\\n[\\s ]+\\n", "\n");
// Prefix an indent when the text does not already start with one.
// NOTE(review): if these literals are ASCII spaces, the trim() below removes the indent again.
if (!text.startsWith(" ")) text = "  " + text;
return text.trim();
}
/**
 * Converts a flat array of alternating chapter URL / chapter title entries into
 * {@link Chapter} objects.
 *
 * @param rows   {@code [url, title, url, title, ...]}; may be {@code null}. A trailing URL
 *               without a title is ignored.
 * @param domain value stored on every chapter via {@code setDomain}
 * @return chapters in array order, with indexes starting at 1; never {@code null}
 */
public static List<Chapter> getChaptersLst(String[] rows,String domain){
    List<Chapter> lst = new ArrayList<Chapter>();
    if (rows == null) {
        return lst;
    }
    // "i + 1 < length" keeps rows[i + 1] in bounds for an odd-length array.
    for (int i = 0; i + 1 < rows.length; i += 2) {
        Chapter chapter = new Chapter();
        chapter.setChapterUrl(rows[i]);
        chapter.setChapterName(rows[i + 1]);
        chapter.setDomain(domain);
        chapter.setIndex(i / 2 + 1); // chapter number, 1-based
        lst.add(chapter);
    }
    return lst;
}
/**
 * Runs the rule-driven chapter extraction for {@code url} and wraps the resulting
 * URL/title pairs in {@link Chapter} objects.
 *
 * @param url      value bound to the {@code url} template variable of the rules
 * @param siteJson site rule containing "chapterUrlRegexOnMulu"
 * @param siteName stored on every chapter as its domain
 * @param maxAage  cache max-age, in seconds, for pages fetched by the rules
 * @param siteRule supplies request headers, user agents and the fallback encoding
 * @return the chapters built by {@code getChaptersLst}
 * @throws JSONException if a rule object lacks a required key
 */
public static List<Chapter> getChapters(String url, JSONObject siteJson,String siteName,int maxAage,SiteRule siteRule) throws JSONException {
    String[] urlTitlePairs = getChapters(url, siteJson, maxAage, siteRule);
    return getChaptersLst(urlTitlePairs, siteName);
}
/**
 * Evaluates the rule list stored under "chapterUrlRegexOnMulu" and returns the strings
 * produced by the rules' output templates.
 *
 * <p>Rule format, as read by this method:
 * <ul>
 *   <li>"chapterUrlRegexOnMulu" is an array of rule objects.</li>
 *   <li>Each rule has a "source" template and a "regexs" array. When the expanded source
 *       starts with {@code html:}, the remainder is fetched as a URL and the response text
 *       becomes the source.</li>
 *   <li>Each regex object has "regex", a boolean "group" (true: {@code REUtil.groups},
 *       false: {@code REUtil.matchs}) and a "name" under which its values are stored as a
 *       template variable. It may have a "child" of the same shape that is applied to every
 *       value; a child may carry an "output" array of templates whose expansions are
 *       appended to the result.</li>
 * </ul>
 * Templates use {@code {name}} and {@code {name[index]}} placeholders; {@code url} is
 * predefined.
 *
 * @param url      value of the {@code url} template variable
 * @param siteJson site rule
 * @param maxAge   cache max-age, in seconds, for fetched pages
 * @param siteRule supplies request headers, user agents and the fallback encoding
 * @return the expanded output strings, or {@code null} when the rule list is absent or empty
 * @throws JSONException if a rule object lacks a required key
 */
public static String[] getChapters(String url, JSONObject siteJson, int maxAge,SiteRule siteRule) throws JSONException {
    // Check for the key before reading it, so a missing rule list yields null instead of
    // a JSONException.
    if (!siteJson.has("chapterUrlRegexOnMulu")) {
        return null;
    }
    JSONArray muluArray = siteJson.getJSONArray("chapterUrlRegexOnMulu");
    if (muluArray == null || muluArray.length() == 0) {
        return null;
    }

    Map<String, Object> context = new HashMap<String, Object>();
    context.put("url", url);
    List<String> result = new ArrayList<String>();

    // Outermost level: one rule object per source.
    for (int i = 0; i < muluArray.length(); i++) {
        JSONObject regexsJson = muluArray.getJSONObject(i);
        String source = getContent(regexsJson.getString("source"), context);
        if (source.startsWith("html:")) {
            source = access(source.substring("html:".length()), maxAge, siteRule);
        }

        // First-level regex objects, applied to the source text.
        JSONArray regexsArray = regexsJson.getJSONArray("regexs");
        for (int j = 0; j < regexsArray.length(); j++) {
            JSONObject regexJson = regexsArray.getJSONObject(j);
            String[] values = applyRegexRule(regexJson, source);
            if (values == null) {
                // Nothing extracted: nothing to publish and nothing for a child to process.
                continue;
            }
            context.put(regexJson.getString("name"), values);
            if (!regexJson.has("child")) {
                continue;
            }

            // The child is usually the level that produces the returned strings.
            JSONObject childJson = regexJson.getJSONObject("child");
            for (String value : values) {
                String[] values2 = applyRegexRule(childJson, value);
                if (values2 == null) {
                    continue;
                }
                context.put(childJson.getString("name"), values2);
                if (!childJson.has("output")) {
                    continue;
                }
                JSONArray outputArray = childJson.getJSONArray("output");
                for (int m = 0; m < outputArray.length(); m++) {
                    String v = getContent(outputArray.getString(m), context);
                    if (v != null) result.add(v);
                }
            }
        }
    }
    return result.toArray(new String[0]);
}

/** Applies one regex object to {@code input}, using groups or whole matches per its "group" flag. */
private static String[] applyRegexRule(JSONObject ruleJson, String input) throws JSONException {
    String regex = ruleJson.getString("regex");
    return ruleJson.getBoolean("group")
            ? REUtil.groups(regex, input)
            : REUtil.matchs(regex, input);
}
/**
 * Expands {@code {name}} and {@code {name[index]}} placeholders in a template.
 *
 * <p>{@code {name}} is replaced by {@code context.get(name).toString()}. {@code {name[i]}} is
 * replaced by element {@code i} when the context value is a {@code String[]}, and by the empty
 * string when the value exists but is not a {@code String[]}. A placeholder is left in the
 * text unchanged when its variable is missing, its index is not a number, or the index is
 * outside the array.
 *
 * @param var     template text
 * @param context variable values, either plain objects or {@code String[]}
 * @return the template with every resolvable placeholder substituted
 */
private static String getContent(String var, Map<String, Object> context) {
    String[] vs = REUtil.matchs("\\{[^\\}]+?\\}", var);
    // Other callers in this class treat a null result as "no match"; do the same here.
    if (vs == null) {
        return var;
    }
    for (String v : vs) {
        String vn = v.substring(1, v.length() - 1);
        String value = "";
        if (vn.endsWith("]")) {
            int open = vn.indexOf("[");
            if (open == -1) continue;
            int index;
            try {
                index = Integer.parseInt(vn.substring(open + 1, vn.length() - 1));
            } catch (NumberFormatException ignored) {
                // Malformed index: skip it like every other unresolvable placeholder.
                continue;
            }
            Object ov = context.get(vn.substring(0, open));
            if (ov == null) continue;
            if (ov instanceof String[]) {
                String[] ovs = (String[]) ov;
                if (index < 0 || index >= ovs.length) continue;
                value = ovs[index];
            }
        } else {
            Object ov = context.get(vn);
            if (ov == null) continue;
            value = ov.toString();
        }
        int pos = var.indexOf(v);
        if (pos < 0) continue;
        var = var.substring(0, pos) + value + var.substring(pos + v.length());
    }
    return var;
}
/**
 * Downloads {@code url} with a blocking GET request and returns the response body as text.
 *
 * <p>The request carries {@code Cache-Control: public, max-age=<maxAge>}, the name/value
 * header pairs from {@code siteRule.getHeaders()}, and a User-Agent picked at random from
 * {@code siteRule.getUserAgents()} (or {@code HttpMethods.USERAGENT} when none are configured).
 * The body is decoded by {@code enconding} with {@code siteRule.getEncoding()} as the fallback.
 *
 * @param url      address to fetch
 * @param maxAge   value for the Cache-Control max-age directive, in seconds
 * @param siteRule per-site request settings
 * @return the decoded body, or "" when the request fails with an {@link IOException}
 */
private static String access(String url,int maxAge, SiteRule siteRule) {
    Request.Builder builder = new Request.Builder()
            .url(url)
            .removeHeader("Pragma")
            .header("Cache-Control", "public, max-age=" + maxAge);

    // Headers are stored as [name, value, name, value, ...]; ignore a dangling name.
    String[] headers = siteRule.getHeaders();
    if (headers != null) {
        for (int i = 0; i + 1 < headers.length; i += 2) {
            builder.header(headers[i], headers[i + 1]);
        }
    }

    String userAgent = HttpMethods.USERAGENT;
    String[] userAgents = siteRule.getUserAgents();
    if (userAgents != null && userAgents.length > 0) {
        // nextInt's bound is exclusive: using length (not length - 1) makes every entry
        // selectable and avoids nextInt(0), which throws, for a single-entry list.
        userAgent = userAgents[new Random().nextInt(userAgents.length)];
    }
    builder.removeHeader("User-Agent").addHeader("User-Agent", userAgent);

    Response response = null;
    try {
        response = HttpMethods.getOkClient().newCall(builder.build()).execute();
        return enconding(response.body(), siteRule.getEncoding());
    } catch (IOException e) {
        Log.e(TAG, "access: ", e);
    } finally {
        if (response != null && response.body() != null) {
            response.body().close();
        }
    }
    return "";
}
/**
 * Reads a response body as text and closes it.
 *
 * <p>When the response declares a charset in its Content-Type, {@code body.string()} is used
 * so that charset applies. Otherwise the bytes are decoded with {@code encode}; if
 * {@code encode} is null or empty, {@code body.string()} is used with the HTTP client's
 * default charset.
 *
 * <p>Decoding is best-effort: any failure is logged and "" is returned.
 *
 * @param body   response body; may be {@code null}
 * @param encode charset name to use when the response does not declare one
 * @return the decoded text, or "" when {@code body} is null or decoding fails
 * @throws UnsupportedEncodingException kept in the signature for existing callers; failures
 *         are handled inside this method
 */
public static String enconding(ResponseBody body, String encode) throws UnsupportedEncodingException {
    if (body == null) {
        return "";
    }
    String s = "";
    try {
        // contentType() is null when the server sent no Content-Type header.
        Charset charset = body.contentType() == null ? null : body.contentType().charset();
        if (charset != null || encode == null || encode.isEmpty()) {
            s = body.string();
        } else {
            s = new String(body.bytes(), encode);
        }
    } catch (Exception er) {
        // Deliberately broad: callers expect "" rather than an exception, but leave a trace.
        Log.e(TAG, "enconding: failed to decode response body", er);
    } finally {
        body.close();
    }
    return s;
}
/**
 * Tells whether a string carries no characters at all.
 *
 * @param value string to test; may be {@code null}
 * @return {@code true} for {@code null} and for the empty string; whitespace-only strings
 *         are NOT considered blank
 */
private static boolean isBlank(String value) {
    if (value == null) {
        return true;
    }
    return value.length() == 0;
}
/**
 * Resolves a link found on a page into a full URL.
 *
 * <ul>
 *   <li>{@code //host/path} gets the scheme of {@code referer} ({@code http:} when the referer
 *       is empty or not https).</li>
 *   <li>With an empty referer, any other link is returned unchanged.</li>
 *   <li>Links that already start with {@code http://} or {@code https://} (any case) are
 *       returned unchanged.</li>
 *   <li>{@code /path} is appended to the referer's scheme and host.</li>
 *   <li>Anything else is appended to the referer's directory.</li>
 * </ul>
 * The referer's query string and fragment are ignored when locating its host and directory,
 * so a {@code /} inside them cannot be mistaken for a path separator. {@code ..} and
 * {@code .} segments are not normalised.
 *
 * @param url     link to resolve; {@code null} is returned as {@code null}
 * @param referer URL of the page the link was found on; may be {@code null} or empty
 * @return the resolved URL
 */
public static String getFullUrl(String url, String referer) {
    if (url == null) {
        return null;
    }
    boolean noReferer = referer == null || referer.isEmpty();

    // Protocol-relative link: borrow the referer's scheme.
    if (url.startsWith("//")) {
        boolean secure = !noReferer && referer.regionMatches(true, 0, "https:", 0, 6);
        return (secure ? "https:" : "http:") + url;
    }
    if (noReferer) {
        return url;
    }
    if (url.regionMatches(true, 0, "http://", 0, 7) || url.regionMatches(true, 0, "https://", 0, 8)) {
        return url;
    }

    // Cut the referer at the first '?' or '#': only scheme, host and path take part below.
    int cut = referer.length();
    int query = referer.indexOf('?');
    if (query >= 0) {
        cut = query;
    }
    int fragment = referer.indexOf('#');
    if (fragment >= 0 && fragment < cut) {
        cut = fragment;
    }
    String base = referer.substring(0, cut);

    if (url.startsWith("/")) {
        // Searching from offset 8 skips the "//" of "http://" and "https://".
        int index = base.indexOf("/", 8);
        String host = index > -1 ? base.substring(0, index) : base;
        return host + url;
    }
    // A last slash at offset <= 7 belongs to the scheme separator, so there is no directory.
    int index = base.lastIndexOf("/");
    String prefix = index > 7 ? base.substring(0, index) : base;
    return prefix + "/" + url;
}
}