Jsoup实现实时爬取
文章目录
- 1.作用
- 2.使用
- pom文件引入
- 示例代码(来自官网)
- 3.测试代码
- 4.上线代码
- 控制层
- 业务层
1.作用
使用 Jsoup 获取 HTML 文档,然后从中解析出需要的字段。
2.使用
pom文件引入
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
示例代码(来自官网)
// Fetch the page over HTTP and parse it into a DOM document.
Document doc = Jsoup.connect("https://en.wikipedia.org/").get();
// NOTE: log(...) is a helper that is not shown in this snippet; presumably a printf-style printer.
log(doc.title());
// CSS query: every <a> inside a <b> inside the element with id "mp-itn".
Elements newsHeadlines = doc.select("#mp-itn b a");
for (Element headline : newsHeadlines) {
log("%s\n\t%s",
// absUrl resolves the href attribute to an absolute URL.
headline.attr("title"), headline.absUrl("href"));
}
3.测试代码
/**
 * Fetches one image-search result page with Jsoup and prints the pictures
 * (image URL + title) that could be extracted from it.
 *
 * <p>Each result tile is expected to contain a {@code .iusc} node whose
 * {@code m} attribute holds a JSON object with a {@code murl} entry, and a
 * {@code .inflnk} node whose {@code aria-label} attribute holds the title.
 * Tiles that lack either node are skipped instead of failing the whole run.
 *
 * @throws IOException if the page cannot be downloaded
 */
@Test
void getPicture() throws IOException {
    int page = 1;
    // Address of the page to parse.
    String url = "https://cn.xxxx.com/images/search?q=小黑子&first="+page;
    // Download and parse the HTML document.
    Document doc = Jsoup.connect(url).get();
    // Every element carrying both classes is one result tile.
    Elements tiles = doc.select(".iuscp.isv");
    List<Picture> pictureList = new ArrayList<>();
    for (Element tile : tiles) {
        // selectFirst returns null when nothing matches, whereas select(...).get(0)
        // would throw IndexOutOfBoundsException on an incomplete tile.
        Element metaNode = tile.selectFirst(".iusc");
        Element titleNode = tile.selectFirst(".inflnk");
        if (metaNode == null || titleNode == null) {
            continue;
        }
        // The "m" attribute is a JSON string; "murl" is the image address.
        String metaJson = metaNode.attr("m");
        if (metaJson.isEmpty()) {
            continue;
        }
        Map<?, ?> meta = JSONUtil.toBean(metaJson, Map.class);
        Object imageUrl = meta == null ? null : meta.get("murl");
        if (!(imageUrl instanceof String)) {
            continue;
        }
        Picture picture = new Picture();
        picture.setUrlImage((String) imageUrl);
        picture.setTitle(titleNode.attr("aria-label"));
        pictureList.add(picture);
    }
    System.out.println(pictureList);
}
4.上线代码
控制层
/**
 * Search endpoint: returns one page of pictures matching the search text.
 *
 * <p>A missing or non-positive {@code page} falls back to 1 and a missing or
 * non-positive {@code pageSize} falls back to 10, so an incomplete request no
 * longer fails with a {@link NullPointerException} while unboxing.
 *
 * <p>NOTE(review): this is a GET mapping that reads its arguments from the
 * request body; some HTTP clients and proxies drop bodies on GET requests.
 * Consider POST or query parameters, which would be an API change.
 *
 * @param pictureDto request payload carrying the page number, page size and search text
 * @return a successful {@code Result} wrapping the page of pictures
 * @throws DataSizeBigException if a page beyond 20 is requested
 */
@GetMapping("/list/vo")
public Result getPictureList(@RequestBody PictureDto pictureDto){
    final int defaultPage = 1;
    final int defaultPageSize = 10;
    final int maxPage = 20;

    Integer page = pictureDto.getPage();
    if (page == null || page < 1) {
        page = defaultPage;
    }
    Integer pageSize = pictureDto.getPageSize();
    if (pageSize == null || pageSize < 1) {
        pageSize = defaultPageSize;
    }
    // Refuse deep pagination so a single caller cannot trigger excessive crawling.
    if (page > maxPage) {
        throw new DataSizeBigException(MessageConstant.DATA_SIZE_BIG_ERROR);
    }
    String searchText = pictureDto.getSearText();
    IPage<Picture> pageBean = pictureService.searchPicture(searchText, page, pageSize);
    return Result.success(pageBean);
}
业务层
/**
 * Searches Bing Images for the given text and returns one page of pictures.
 *
 * <p>The result page is downloaded with Jsoup and every {@code .iuscp.isv}
 * tile is converted into a {@code Picture}; tiles that lack the expected
 * nodes or attributes are skipped. At most {@code pageSize} pictures are
 * returned. The total count of the returned page is not set because the
 * scraped page does not provide one.
 *
 * @param searchText text to search for; {@code null} or blank yields an empty page
 * @param page       1-based page number
 * @param pageSize   maximum number of pictures to return
 * @return the page holding the extracted pictures (possibly empty)
 * @throws IllegalArgumentException     if {@code page} or {@code pageSize} is {@code null} or less than 1
 * @throws java.io.UncheckedIOException if the result page cannot be downloaded
 */
@Override
public IPage<Picture> searchPicture(String searchText, Integer page, Integer pageSize) {
    if (page == null || page < 1) {
        throw new IllegalArgumentException("page must be >= 1, got " + page);
    }
    if (pageSize == null || pageSize < 1) {
        throw new IllegalArgumentException("pageSize must be >= 1, got " + pageSize);
    }

    List<Picture> pictureList = new ArrayList<>();
    IPage<Picture> pagePic = new Page<>(page, pageSize);
    pagePic.setRecords(pictureList);
    if (searchText == null || searchText.trim().isEmpty()) {
        return pagePic;
    }

    // Offset of the first result of the requested page. The original computed the
    // start index but sent the page number instead, so consecutive pages overlapped.
    // NOTE(review): assumes Bing's "first" parameter is a 1-based result offset - confirm.
    long firstResult = (long) (page - 1) * pageSize + 1;

    Document doc;
    try {
        // data(...) appends URL-encoded query parameters for GET requests, so spaces,
        // '&', '#' or non-ASCII characters in the search text cannot corrupt the URL.
        doc = Jsoup.connect("https://cn.bing.com/images/search")
                .data("q", searchText)
                .data("first", String.valueOf(firstResult))
                .get();
    } catch (IOException e) {
        // Fail loudly with the cause instead of continuing with a null document.
        throw new java.io.UncheckedIOException(
                "Failed to fetch image search results for '" + searchText + "'", e);
    }

    for (Element tile : doc.select(".iuscp.isv")) {
        if (pictureList.size() >= pageSize) {
            break;
        }
        Picture picture = parsePicture(tile);
        if (picture != null) {
            pictureList.add(picture);
        }
    }
    return pagePic;
}

/**
 * Converts one result tile into a {@code Picture}.
 *
 * @param tile a {@code .iuscp.isv} element of the result page
 * @return the picture, or {@code null} when the tile lacks the image metadata or the title node
 */
private Picture parsePicture(Element tile) {
    // selectFirst returns null when nothing matches; select(...).get(0) would throw.
    Element metaNode = tile.selectFirst(".iusc");
    Element titleNode = tile.selectFirst(".inflnk");
    if (metaNode == null || titleNode == null) {
        return null;
    }
    // The "m" attribute is a JSON string; its "murl" entry is the image address.
    String metaJson = metaNode.attr("m");
    if (metaJson.isEmpty()) {
        return null;
    }
    Map<?, ?> meta = JSONUtil.toBean(metaJson, Map.class);
    Object imageUrl = meta == null ? null : meta.get("murl");
    if (!(imageUrl instanceof String)) {
        return null;
    }
    Picture picture = new Picture();
    picture.setUrlImage((String) imageUrl);
    picture.setTitle(titleNode.attr("aria-label"));
    return picture;
}