当前位置: 首页 > article >正文

C HTML格式解析与生成之gumbo

测试

#include <stdlib.h>
#include <string.h>

#include <fstream>
#include <iostream>
#include <string>

#include "../src/gumbo.h"

// Extracts the plain-text content of the subtree rooted at `node`.
//
// - A text node yields its own text.
// - An element node other than <script> or <style> yields the text of its
//   children, visited in order. A single space is inserted before each
//   non-empty child result except the one at index 0.
// - Any other node (including <script>/<style> elements) yields "".
static std::string cleantext(GumboNode* node) {
  if (node->type == GUMBO_NODE_TEXT) {
    return std::string(node->v.text.text);
  }

  // Short-circuit order matters: the element union member is only read once
  // the node is known to be an element.
  const bool is_text_bearing_element =
      node->type == GUMBO_NODE_ELEMENT &&
      node->v.element.tag != GUMBO_TAG_SCRIPT &&
      node->v.element.tag != GUMBO_TAG_STYLE;
  if (!is_text_bearing_element) {
    return std::string();
  }

  std::string joined;
  const GumboVector& kids = node->v.element.children;
  for (unsigned int idx = 0; idx < kids.length; ++idx) {
    const std::string piece =
        cleantext(static_cast<GumboNode*>(kids.data[idx]));
    if (idx > 0 && !piece.empty()) {
      joined += " ";
    }
    joined += piece;
  }
  return joined;
}

// Walks the subtree rooted at `node` and writes the `href` attribute value of
// every <a> element that has one to std::cout, one value per line.
// Non-element nodes are ignored.
static void search_for_links(GumboNode* node) {
  if (node->type != GUMBO_NODE_ELEMENT) {
    return;
  }

  // The attribute lookup is only performed for anchor elements.
  if (node->v.element.tag == GUMBO_TAG_A) {
    GumboAttribute* link =
        gumbo_get_attribute(&node->v.element.attributes, "href");
    if (link) {
      std::cout << link->value << std::endl;
    }
  }

  const GumboVector& kids = node->v.element.children;
  for (unsigned int idx = 0; idx != kids.length; ++idx) {
    search_for_links(static_cast<GumboNode*>(kids.data[idx]));
  }
}

// 查找标题
#include <assert.h>
// Looks up the document title below `root`.
//
// `root` must be an element node with at least two children (asserted).
// The function locates the first <head> child of `root`, then the first
// <title> child of that <head>.
//
// Returns:
//   - the text pointer stored in the title's single child node (not a copy;
//     presumably valid only while the parse output is alive -- confirm
//     against the gumbo documentation),
//   - "<empty title>" when <title> does not have exactly one child,
//   - "<no title found>" when no <head> or no <title> element is found.
static const char* find_title(const GumboNode* root) {
  assert(root->type == GUMBO_NODE_ELEMENT);
  assert(root->v.element.children.length >= 2);

  const GumboVector* root_children = &root->v.element.children;
  const GumboNode* head = NULL;
  // `length` is unsigned, so the index is unsigned too.
  for (unsigned int i = 0; i < root_children->length; ++i) {
    const GumboNode* child =
        static_cast<const GumboNode*>(root_children->data[i]);
    if (child->type == GUMBO_NODE_ELEMENT &&
        child->v.element.tag == GUMBO_TAG_HEAD) {
      head = child;
      break;
    }
  }
  assert(head != NULL);
  // With NDEBUG the assert above vanishes; avoid dereferencing a null head.
  if (head == NULL) {
    return "<no title found>";
  }

  const GumboVector* head_children = &head->v.element.children;
  for (unsigned int i = 0; i < head_children->length; ++i) {
    const GumboNode* child =
        static_cast<const GumboNode*>(head_children->data[i]);
    if (child->type == GUMBO_NODE_ELEMENT &&
        child->v.element.tag == GUMBO_TAG_TITLE) {
      if (child->v.element.children.length != 1) {
        return "<empty title>";
      }
      const GumboNode* title_text =
          static_cast<const GumboNode*>(child->v.element.children.data[0]);
      assert(title_text->type == GUMBO_NODE_TEXT ||
             title_text->type == GUMBO_NODE_WHITESPACE);
      return title_text->v.text.text;
    }
  }
  return "<no title found>";
}


// Returns an excerpt of the source line in `original_text` that contains the
// original value of `attr`.
//
// `attr.original_value.data` is expected to point into `original_text`'s
// buffer; its offset from `original_text.data()` is used as the anchor.
// The excerpt starts at the later of (line start, anchor - 40) and stops at
// the earlier of (one before the line's terminating "\n" or the last index of
// the text, anchor + 40); the character at the stop index is not included.
// Returns an empty string when that range is empty.
static std::string find_line(
    const std::string& original_text, const GumboAttribute& attr) {
  const size_t kContext = 40;
  const size_t attr_index = attr.original_value.data - original_text.data();

  // rfind yields npos when there is no preceding newline; npos + 1 wraps to 0,
  // which is exactly the start of the first line.
  size_t begin = original_text.rfind("\n", attr_index) + 1;

  size_t end = original_text.find("\n", attr_index);
  if (end == std::string::npos) {
    // Guard the empty-text case so length() - 1 cannot wrap around.
    end = original_text.empty() ? 0 : original_text.length() - 1;
  } else if (end > 0) {
    end--;
  }

  if (end > attr_index + kContext) {
    end = attr_index + kContext;
  }
  // Only narrow the left edge when the subtraction cannot underflow; an
  // attribute within the first 40 bytes previously produced a huge `begin`
  // and made substr() throw std::out_of_range.
  if (attr_index > kContext && begin < attr_index - kContext) {
    begin = attr_index - kContext;
  }

  if (begin >= end) {
    return std::string();
  }
  return original_text.substr(begin, end - begin);
}

// Walks the subtree rooted at `node` and reports every element whose `class`
// attribute value contains `cls_name`.
//
// For each match, writes "<line>:<column> - <excerpt>" to std::cout, where the
// position is the attribute's value_start and the excerpt comes from
// find_line(original_text, attribute).
//
// Matching uses strstr, i.e. it is a plain substring test on the whole
// attribute value rather than a comparison against individual class tokens.
// Non-element nodes are ignored.
static void search_for_class(
    GumboNode* node, const std::string& original_text, const char* cls_name) {
  if (node->type != GUMBO_NODE_ELEMENT) {
    return;
  }

  GumboAttribute* cls_attr =
      gumbo_get_attribute(&node->v.element.attributes, "class");
  if (cls_attr != NULL && strstr(cls_attr->value, cls_name) != NULL) {
    std::cout << cls_attr->value_start.line << ":"
              << cls_attr->value_start.column << " - "
              << find_line(original_text, *cls_attr) << std::endl;
  }

  GumboVector* children = &node->v.element.children;
  // `length` is unsigned, so the index is unsigned too.
  for (unsigned int i = 0; i < children->length; ++i) {
    search_for_class(
        static_cast<GumboNode*>(children->data[i]), original_text, cls_name);
  }
}

// Entry point: reads the HTML file named by the single command-line argument,
// parses it with gumbo using kGumboDefaultOptions, and prints the result of
// cleantext() on the parsed root to std::cout.
//
// Returns EXIT_FAILURE when the argument count is wrong, the file cannot be
// opened, or its contents cannot be read; otherwise returns 0.
int main(int argc, char** argv) {
  if (argc != 2) {
    std::cout << "Usage: clean_text <html filename>\n";
    return EXIT_FAILURE;
  }
  const char* filename = argv[1];

  std::ifstream in(filename, std::ios::in | std::ios::binary);
  if (!in) {
    std::cout << "File " << filename << " not found!\n";
    return EXIT_FAILURE;
  }

  // Determine the file size by seeking to the end. tellg() reports failure as
  // -1, which must not be fed to resize() (it would become a huge size_t).
  in.seekg(0, std::ios::end);
  const std::streamoff size = in.tellg();
  if (size < 0) {
    std::cout << "File " << filename << " could not be read!\n";
    return EXIT_FAILURE;
  }

  std::string contents;
  contents.resize(static_cast<size_t>(size));
  in.seekg(0, std::ios::beg);
  in.read(&contents[0], static_cast<std::streamsize>(contents.size()));
  if (!in) {
    std::cout << "File " << filename << " could not be read!\n";
    return EXIT_FAILURE;
  }
  in.close();

  GumboOutput* output = gumbo_parse_with_options(
      &kGumboDefaultOptions, contents.data(), contents.length());
  std::cout << cleantext(output->root) << std::endl;
  // The other demos are kept disabled; uncomment one to try it.
  //search_for_links(output->root);
  //find_title(output->root);
  //const char* cls = "article";
  //search_for_class(output->root, contents, cls);
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  return 0;
}
效果

参考

C HTML格式解析与生成-CSDN博客

GitHub - google/gumbo-parser: An HTML5 parsing library in pure C99


创作不易,小小的支持一下吧!


http://www.kler.cn/a/310653.html

相关文章:

  • 微服务架构面试内容整理-SpringCloud Netflix‌与Spring Cloud Alibaba比较
  • 15 个改变世界的开源项目:塑造现代技术的先锋力量
  • Mysql前言
  • 介绍几个提取视频文案的Coze插件
  • 电脑不显示wifi列表怎么办?电脑不显示WiFi列表的解决办法
  • 股票投资学习路线图
  • python怎么输入整数
  • 万能小程序运营管理系统 _requestPost 任意文件读取漏洞复现
  • DAY20240911 VUE:解锁前端路由的奥秘:如何在单页应用中避免404困境?
  • 流量牵引技术与传统防火墙的区别
  • 在网络环境中怎么保护个人信息安全?
  • 土壤墒情测定仪的工作原理
  • 汽车软件开发之敏捷开发
  • Spring 源码解读:手动实现Spring事件机制
  • JSON.parseArray 内存溢出
  • 【第十一章:Sentosa_DSML社区版-机器学习分类】
  • Oracle数据库高级技术探秘:分区表管理与代码实战
  • Python 全栈系列271 微服务踩坑记
  • 数据库学习02——mysql清空表数据后 IBD 文件仍很大的解决方案
  • 面向开发者的LLM入门教程(学习笔记01)
  • 探索学习Python的最佳开发环境和编辑器
  • 家用燃气报警器-家庭可燃气体探测器-旭华智能
  • 【网络安全】服务基础第二阶段——第四节:Linux系统管理基础----Linux网络与日志服务器
  • Docker 镜像制作(Dockerfile)
  • 为解决bypy大文件上传报错—获取百度云文件直链并使用Aria2上传文件至服务器
  • Mini-Omni:语言模型可以在流中听、说和思考