基于streamlit搭简易前端页面
前端小白第一次用streamlit搭简易页面,记录一下。
一些tips
每次与页面进行交互(如点击按钮、上传文件等),streamlit都会重新运行整个页面的所有代码。如果在页面渲染前需要对上传文件做很复杂的操作,每次重新运行都会重复这个过程,导致页面加载变慢。因此可以把不会变化的内容存到 st.session_state 里,避免重复对文件进行处理。
streamlit的渲染顺序与代码的书写顺序一致,例如先在代码里写了一级标题就先渲染一级标题,先写占位符就先渲染占位符。因此在脚本重新运行的过程中,排在前面的部分会先完成渲染,尚未重新渲染的部分会暂时显示为灰色。
一些命令
将页面分成两列
# Split the page into two columns with a 1:1 width ratio.
left_col, right_col = st.columns([1, 1])
在占位符里动态渲染变化内容
# Reserve a slot in the page, then fill it with the HTML currently stored in
# session state. The call must be indented under `with` so that it renders
# inside the placeholder's container.
left_content_placeholder = st.empty()
with left_content_placeholder.container():
    st.components.v1.html(st.session_state['current_content'], height=600, scrolling=True)
在页面加载时滚动到当前匹配的结果
# Script meant to be appended to the rendered HTML: once the DOM has loaded,
# it looks up the element with id "current_match" and, if one exists, scrolls
# it smoothly into the vertical center of the view.
scroll_script = """
<script>
document.addEventListener('DOMContentLoaded', function() {
var element = document.getElementById('current_match');
if(element) {
element.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
});
</script>
"""
代码示例
import streamlit as st
from docx import Document
import mammoth
import os
from bs4 import BeautifulSoup
import re
from PIL import Image
import base64
import markdown
# 解析Word文档的函数,保留原格式
def parse_word_document_for_table(file):
    """Render the top-level tables of a Word document as one HTML fragment.

    The document body is walked in order. Every table becomes an ``<h3>``
    title followed by a bordered ``<table>``. The title is the text of the
    most recent paragraph seen before the table, or "无标题" when that text is
    empty or missing. Paragraphs themselves are not emitted.

    Args:
        file: Whatever ``docx.Document`` accepts (a path or a binary
            file-like object).

    Returns:
        The HTML string (empty when the document contains no table), or
        ``"Error: <message>"`` when anything goes wrong. This function never
        raises for failures inside the parsing logic.
    """
    from html import escape

    cell_style = (
        "padding: 5px; border: 1px solid black; "
        "text-align: left; vertical-align: top;"
    )
    try:
        # The previous Mammoth conversion was dropped: its output was
        # overwritten by the table HTML and never returned, so it only
        # doubled the parsing time.
        document = Document(file)
        # Read the table list once; it is indexed in step with the <w:tbl>
        # elements met while walking the body.
        tables = document.tables
        table_index = 0
        parts = []
        prev_paragraph = ""
        for element in document.element.body:
            if element.tag.endswith('tbl'):
                table = tables[table_index]
                table_index += 1
                table_title = prev_paragraph.strip() if prev_paragraph else "无标题"
                # Escape document text so characters such as "<" or "&" in a
                # title or cell cannot break (or inject) markup.
                parts.append(f"<h3>{escape(table_title)}</h3>")
                parts.append(
                    "<table border='1' style='border: 1px solid black; "
                    "border-collapse: collapse; width: 100%;'>"
                )
                for row in table.rows:
                    parts.append("<tr>")
                    for cell in row.cells:
                        # Escape first, then turn in-cell line breaks into <br>.
                        cell_content = escape(cell.text).replace('\n', '<br>')
                        parts.append(f"<td style='{cell_style}'>{cell_content}</td>")
                    parts.append("</tr>")
                parts.append("</table><br>")
            elif element.tag.endswith('p'):
                # NOTE(review): this reads `.text` on the raw XML element. If
                # the installed python-docx does not expose paragraph text
                # there, the value is None and every table falls back to
                # "无标题" - confirm against the pinned version.
                prev_paragraph = element.text
        return "".join(parts)
    except Exception as e:
        return f"Error: {str(e)}"
def parse_word_document(file):
    """Convert a Word document to HTML with Mammoth.

    Args:
        file: The document handed straight to ``mammoth.convert_to_html``.

    Returns:
        The converted HTML, or ``"Error: <message>"`` if the conversion
        raises.
    """
    try:
        conversion = mammoth.convert_to_html(file)
    except Exception as exc:
        return f"Error: {exc}"
    return conversion.value
def extract_markdown(file):
    """Read a Markdown file object and return it rendered as HTML.

    Args:
        file: An open file object. Bytes (binary mode) are decoded as UTF-8;
            text that is already decoded (text mode) is used as is.

    Returns:
        The HTML produced by ``markdown.markdown``.

    Raises:
        UnicodeDecodeError: If binary content is not valid UTF-8.
    """
    raw = file.read()
    if isinstance(raw, (bytes, bytearray)):
        # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
        # byte-order mark, which would otherwise sit in front of the first
        # line and stop a heading there from being recognised.
        content_text = raw.decode('utf-8-sig')
    else:
        content_text = raw
    # NOTE(review): no extensions are enabled, so pipe tables in the source
    # would be rendered as plain paragraphs - confirm whether the displayed
    # files need the "tables" extension.
    return markdown.markdown(content_text)
# Tags whose text is inspected when searching the rendered document.
_SEARCHABLE_TAGS = ['p', 'span', 'div', 'td', 'h3']

# Appended to the document HTML so the iframe scrolls to the active match.
_SCROLL_TO_MATCH_SCRIPT = """
<script>
document.addEventListener('DOMContentLoaded', function() {
var element = document.getElementById('current_match');
if(element) {
element.scrollIntoView({ behavior: 'smooth', block: 'center' });
}
});
</script>
"""

# Centers the headers of the two columns.
_HEADER_CSS = """
<style>
.left-header {
display: flex;
justify-content: center;
align-items: center;
height: 100%;
text-align: center;
}
.right-header {
display: flex;
justify-content: center;
align-items: center;
height: 100%;
text-align: center;
}
</style>
"""


def _wrap_keyword_in_text_node(soup, text_node, pattern):
    """Replace every hit of ``pattern`` inside one text node with a yellow <mark>."""
    text = str(text_node)
    pieces = []
    cursor = 0
    for hit in pattern.finditer(text):
        if hit.start() > cursor:
            pieces.append(soup.new_string(text[cursor:hit.start()]))
        mark = soup.new_tag('mark', style='background-color: yellow;')
        mark.string = hit.group(0)
        pieces.append(mark)
        cursor = hit.end()
    if not pieces:
        return
    if cursor < len(text):
        pieces.append(soup.new_string(text[cursor:]))
    for piece in pieces:
        text_node.insert_before(piece)
    text_node.extract()


def _highlight_matches(content, keyword, current_index):
    """Highlight ``keyword`` in ``content`` and flag the match at ``current_index``.

    Matching is case-insensitive and literal. Only text nodes are rewritten,
    so markup such as ``<br/>`` can never be damaged by the keyword.

    Returns:
        ``(html, matched_texts)``: the highlighted HTML and the text of each
        matching element in document order. An empty keyword returns the
        content untouched and no matches.
    """
    if not keyword:
        return content, []
    soup = BeautifulSoup(content, 'html.parser')
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    matched_texts = []
    for element in soup.find_all(_SEARCHABLE_TAGS):
        element_text = element.get_text()
        if not pattern.search(element_text):
            continue
        for text_node in element.find_all(string=True):
            # Text already wrapped while handling an enclosing element must
            # not be wrapped a second time.
            if text_node.parent.name != 'mark':
                _wrap_keyword_in_text_node(soup, text_node, pattern)
        is_current = len(matched_texts) == current_index
        if is_current:
            element['id'] = 'current_match'
        # Keep the element's own style (cell padding, borders) and add the
        # background color: orange for the active match, yellow for the rest.
        base_style = element.get('style', '').strip()
        if base_style and not base_style.endswith(';'):
            base_style += ';'
        color = 'orange' if is_current else 'yellow'
        element['style'] = f"{base_style} background-color: {color};".strip()
        matched_texts.append(element_text)
    return str(soup), matched_texts


def main():
    """Build the two-column report verification page.

    Left column: upload a .docx file, show its tables as HTML inside an
    iframe, and search them. The first click with a new keyword highlights
    all matches and focuses the first one; further clicks with the same
    keyword cycle through the matches.

    Right column: show one of the Markdown result files listed in
    ``DOCUMENT_PATHS``, chosen with the select box, once a document has been
    uploaded successfully.

    Streamlit reruns this function on every interaction, so everything that
    must survive a rerun lives in ``st.session_state``. In particular the
    parsed document is stored once per uploaded file instead of being parsed
    again on each search.
    """
    DOCUMENT_PATHS = {
        "标题验证": "./headings.md",
        "表格信息提取": "./tables_information.md",
        "表格验证结果": "./output.md"
    }

    # Wide page layout.
    st.set_page_config(page_title="项目报告核验", layout="wide")
    logo_path = './logo.jpg'
    st.image(logo_path, width=200)
    st.markdown("<h1 style='text-align: center;'>项目报告核验</h1>", unsafe_allow_html=True)
    st.markdown(_HEADER_CSS, unsafe_allow_html=True)

    # Two equally wide columns: uploaded document on the left, result files
    # on the right.
    left_col, right_col = st.columns([1, 1])

    state = st.session_state
    # first_doc_uploaded, first_render and right_selected_doc are not read
    # anywhere in this function; they are still initialised so the set of
    # session keys stays unchanged.
    initial_state = {
        'first_doc_uploaded': True,
        'left_doc_uploaded': False,
        'left_doc_key': None,
        'left_doc_error': "",
        'original_content': None,
        'current_content': None,
        'right_doc_content': "",
        'right_doc_error': "",
        'right_selected_doc': None,
        'matches': [],
        'current_index': -1,
        'first_render': True,
    }
    for key, value in initial_state.items():
        if key not in state:
            state[key] = value

    with right_col:
        st.markdown('<div class="right-header"><h2>内容展示区域</h2></div>', unsafe_allow_html=True)
        tab_selection2 = st.selectbox(
            "选择展示内容:",
            ("标题验证", "表格信息提取", "表格验证结果")
        )

    with left_col:
        st.markdown('<div class="left-header"><h2>项目文件上传及原文展示</h2></div>', unsafe_allow_html=True)
        uploaded_file = st.file_uploader("上传Word文档", type=["docx"])
        search_keyword = st.text_input("输入要搜索的关键词")
        search_button = st.button("查找/查找下一个")
        left_content_placeholder = st.empty()

        if not uploaded_file:
            state['left_doc_key'] = None
            state['left_doc_error'] = ""
            state['original_content'] = None
            state['current_content'] = None
            state['left_doc_uploaded'] = False
        else:
            # Identify the upload so that a replaced file is parsed again
            # while an unchanged file is parsed only once.
            file_key = (
                getattr(uploaded_file, 'file_id', None),
                uploaded_file.name,
                uploaded_file.size,
            )
            if state['left_doc_key'] != file_key:
                uploaded_file.seek(0)
                content = parse_word_document_for_table(uploaded_file)
                # The parser reports failures as an "Error: ..." string
                # instead of raising.
                failed = content.startswith("Error: ")
                state['left_doc_key'] = file_key
                state['left_doc_error'] = content if failed else ""
                state['original_content'] = None if failed else content
                state['current_content'] = None if failed else content
                state['left_doc_uploaded'] = not failed
                # Search results of a previous document are meaningless now.
                state['matches'] = []
                state['current_index'] = -1
                state['last_search_keyword'] = ''

        # Searching only works once a document has been loaded.
        if state['left_doc_uploaded'] and search_button:
            if search_keyword != state.get('last_search_keyword', ''):
                # New keyword: start over from the unhighlighted document.
                state['last_search_keyword'] = search_keyword
                highlighted, matches = _highlight_matches(
                    state['original_content'], search_keyword, 0
                )
                state['matches'] = matches
                state['current_index'] = 0 if search_keyword else -1
                state['current_content'] = highlighted
            elif state['matches']:
                # Same keyword: move to the next match, wrapping around.
                next_index = (state['current_index'] + 1) % len(state['matches'])
                highlighted, _ = _highlight_matches(
                    state['original_content'], search_keyword, next_index
                )
                state['current_index'] = next_index
                state['current_content'] = highlighted

        document_html = state['current_content']
        if document_html is not None:
            if state['matches']:
                # Always ship the scroll script while a match is active, so
                # the match is brought back into view if the iframe reloads
                # on a later rerun.
                document_html += _SCROLL_TO_MATCH_SCRIPT
            with left_content_placeholder.container():
                st.components.v1.html(document_html, height=600, scrolling=True)
        if state['left_doc_error']:
            st.error(state['left_doc_error'])

    with right_col:
        right_content_placeholder = st.empty()
        if state['left_doc_uploaded']:
            selected_document = DOCUMENT_PATHS.get(tab_selection2)
        else:
            selected_document = None
            state['right_doc_content'] = ""
            right_content_placeholder.empty()

        if selected_document:
            try:
                with open(selected_document, "rb") as file:
                    content = extract_markdown(file)
            except Exception as e:
                # Drop what was shown before, so a failed load never leaves
                # another file's content on screen.
                state['right_doc_error'] = str(e)
                state['right_doc_content'] = ""
                st.error(str(e))
            else:
                if content.startswith("Error"):
                    state['right_doc_error'] = content
                    state['right_doc_content'] = ""
                else:
                    state['right_doc_error'] = ""
                    state['right_doc_content'] = content
                    state['right_selected_doc'] = selected_document

        if state["right_doc_content"]:
            with right_content_placeholder.container():
                # Built without leading indentation so Markdown cannot
                # mistake any of these lines for an indented code block.
                st.markdown(
                    '<div style="height: 750px; overflow-y: scroll; padding: 10px; '
                    'border: 1px solid #ccc; border-radius: 5px;">\n'
                    f'{state["right_doc_content"]}\n'
                    '</div>',
                    unsafe_allow_html=True,
                )
# Build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()