爬虫应用
原创 · 大约 2 分钟 · 约 691 字
模拟登录
依赖
<!-- HtmlUnit 2.49.1; the login example in this article configures the client through webClient.getOptions(). -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.49.1</version>
</dependency>
<!-- NOTE(review): this artifact looks like a repackaged HtmlUnit 2.6.0. Declaring it together with
     htmlunit 2.49.1 would presumably put two HtmlUnit versions on the classpath; confirm which
     one is intended and keep only that one. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>com.springsource.com.gargoylesoftware.htmlunit</artifactId>
<version>2.6.0</version>
</dependency>
功能实现
// Simulated login with HtmlUnit: fill in the login form, submit it, print the
// heading of the landing page, then follow the "销售查询" link and print that page.
// The client is opened in try-with-resources so its windows and connections are
// released even when loading or clicking fails.
try (WebClient webClient = new WebClient()) {
    // Switch off JavaScript and CSS processing.
    // Older HtmlUnit releases set these directly on the client instead:
    //   webClient.setJavaScriptEnabled(false);
    //   webClient.setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);
    webClient.getOptions().setCssEnabled(false);

    // Load the login page.
    HtmlPage page = webClient.getPage(URL);

    // User name field.
    HtmlInput user = page.getHtmlElementById("MainContent_LoginUser_UserName");
    user.setValueAttribute("用户");

    // Password field.
    HtmlInput pass = page.getHtmlElementById("MainContent_LoginUser_Password");
    pass.setValueAttribute("密码");

    // Submit the form; click() returns the page that results from the login.
    HtmlInput btn = page.getHtmlElementById("MainContent_LoginUser_LoginButton");
    HtmlPage click = btn.click();

    // Print the heading(s) matched by the XPath on the landing page.
    for (Object item : click.getByXPath("//*[@id=\"MainContent_Panel1\"]/div/div[1]/h2")) {
        System.out.println(((HtmlElement) item).asText());
    }

    // Follow every anchor whose visible text is exactly "销售查询".
    DomNodeList<DomElement> anchors = click.getElementsByTagName("a");
    for (DomElement anchor : anchors) {
        if (!anchor.asText().equals("销售查询")) {
            continue;
        }
        try {
            HtmlPage click1 = anchor.click();
            System.out.println(click1.asText());
        } catch (IOException e) {
            // A failing link is reported but does not stop the remaining anchors.
            e.printStackTrace();
        }
    }
} catch (IOException e) {
    // MalformedURLException is a subclass of IOException, so one handler covers both.
    e.printStackTrace();
}
获取网页数据
依赖
<!-- jsoup 1.13.1, matching the org.jsoup.Jsoup import used by the example in this section. -->
<!-- NOTE(review): the example also imports com.alibaba.fastjson and com.clzhang.util.Base64Util,
     which this snippet does not declare; confirm where those dependencies come from. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
功能实现
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import java.util.HashMap;
import java.util.Map;
import java.text.SimpleDateFormat;
import java.awt.Toolkit;
import java.awt.datatransfer.StringSelection;
import java.io.File;
import java.io.IOException;
import com.clzhang.util.Base64Util;
/**
 * Polling thread that watches a captcha image file and, whenever its
 * last-modified time advances, uploads the image to a recognition endpoint
 * and places the recognised text on the system clipboard.
 *
 * <p>The clipboard always receives exactly {@link #CODE_LENGTH} characters when a
 * response is obtained: short results are padded with {@code '1'}, long results
 * are truncated, and a failed recognition yields {@link #FALLBACK_CODE}.
 *
 * <p>Interrupting the thread stops the polling loop.
 */
public class PredictThread extends Thread {
    /** Number of characters the clipboard value must have. */
    private static final int CODE_LENGTH = 4;
    /** Clipboard value used when the service reports a failure. */
    private static final String FALLBACK_CODE = "0000";
    /** Pause between two checks of the image file, in milliseconds. */
    private static final long POLL_INTERVAL_MILLIS = 1000L;

    // Path of the captcha image to watch
    String imagePath = "D:\\TDDOWNLOAD\\code\\mycode.gif";
    // Last-modified time of the most recently processed image
    long lastModified = 0L;
    // Login user name sent to the service
    String username = "usertest";
    // Login password sent to the service
    String password = "password123";
    // Remark field: optional
    String remark = "输出计算结果";
    // Recognition type; default 3 = mixed digits and letters
    String typeid = "3";
    // Number of images processed so far
    int count = 0;

    /**
     * Records the image's current last-modified time so that only later
     * changes to the file are treated as new captchas.
     */
    public PredictThread() {
        File file = new File(imagePath);
        lastModified = file.lastModified();
    }

    /**
     * Polls the image file once per {@link #POLL_INTERVAL_MILLIS} and processes
     * it whenever its modification time has advanced. Returns when the thread
     * is interrupted.
     */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            long time = new File(imagePath).lastModified();
            if (time > lastModified) {
                count++;
                lastModified = time;
                processNewImage(time);
            }
            try {
                sleep(POLL_INTERVAL_MILLIS);
            } catch (InterruptedException e) {
                // Restore the flag and leave the loop so the thread can actually be stopped.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Uploads the current image and publishes the recognised code on the clipboard.
     *
     * @param time last-modified timestamp of the image, used for the log line
     */
    private void processNewImage(long time) {
        // Clear the clipboard so a stale code is never picked up.
        copyToClipboard("");

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        System.out.println("有新的验证码需要上传……" + sdf.format(time));

        Map<String, String> data = new HashMap<>();
        data.put("username", username);
        data.put("password", password);
        // Use the configured field rather than a duplicated literal.
        data.put("typeid", typeid);
        data.put("remark", remark);
        data.put("image", Base64Util.getImageStr(imagePath));

        try {
            String resultString = Jsoup.connect("http://api.somesite.com/predict")
                    .requestBody(JSON.toJSONString(data))
                    .header("Content-Type", "application/json")
                    .ignoreContentType(true)
                    .timeout(120000)
                    .post()
                    .text();
            JSONObject jsonObject = JSONObject.parseObject(resultString);
            // A missing "success" key (or an empty body) counts as failure instead of an NPE.
            boolean success = jsonObject != null
                    && Boolean.TRUE.equals(jsonObject.getBoolean("success"));
            if (success) {
                JSONObject payload = jsonObject.getJSONObject("data");
                String result = normalizeCode(payload == null ? null : payload.getString("result"));
                // Publish the result on the clipboard for the foreground program.
                copyToClipboard(result);
                System.out.println(count + " : " + result);
            } else {
                // Recognition failed: still publish a value so the foreground program proceeds.
                copyToClipboard(FALLBACK_CODE);
                String message = jsonObject == null ? null : jsonObject.getString("message");
                System.out.println("识别失败原因为:" + message);
            }
        } catch (IOException ex) {
            // Deliberately not recovered (e.g. network failure): nothing is put on the
            // clipboard, so the foreground program receives no data for this image.
            ex.printStackTrace();
        }
    }

    /**
     * Forces a recognised code to exactly {@link #CODE_LENGTH} characters:
     * longer values are truncated, shorter (or {@code null}) values are
     * right-padded with {@code '1'}.
     */
    private static String normalizeCode(String raw) {
        String code = raw == null ? "" : raw;
        if (code.length() > CODE_LENGTH) {
            return code.substring(0, CODE_LENGTH);
        }
        StringBuilder padded = new StringBuilder(code);
        while (padded.length() < CODE_LENGTH) {
            padded.append('1');
        }
        return padded.toString();
    }

    /** Replaces the system clipboard contents with the given text. */
    private static void copyToClipboard(String text) {
        Toolkit.getDefaultToolkit().getSystemClipboard().setContents(new StringSelection(text), null);
    }

    /**
     * Starts the watcher thread.
     *
     * @param args unused
     */
    public static void main(String[] args) throws Exception {
        PredictThread ins = new PredictThread();
        ins.start();
        System.out.println("守候线程已经启动……");
    }
}