안녕하세요.
아래와 같이 LangChain의 UnstructuredHTMLLoader로 HTML 파일을 Document로 변환했는데, metadata의 source 값은 정상적으로 들어 있으나 page_content 값이 빈 문자열('')로 나옵니다.
[소스코드]
import json
import os
import subprocess
from langchain_community.document_loaders import UnstructuredHTMLLoader
from pathlib import Path
import base64
import http.client
from tqdm import tqdm
import requests
# Folder containing the HTML files to convert.
html_files_dir = Path('/home/embeding/clovastudioguide')
# Only files directly inside the folder whose names end in ".html" are picked up.
html_files = list(html_files_dir.glob("*.html"))

# Collects one entry per HTML file; each entry is whatever loader.load()
# returned for that file (printed below as a list of Document objects).
clovastudiodatas = []
for html_file in html_files:
    # The loop body must be indented under the `for`; at column 0 the script
    # fails with an IndentationError before any file is processed.
    loader = UnstructuredHTMLLoader(str(html_file))
    document_data = loader.load()
    # NOTE(review): if page_content prints as '' here, the loader extracted no
    # text from this file -- presumably a parser/dependency issue or HTML whose
    # body is rendered by JavaScript; verify against the input files.
    print(document_data)
    clovastudiodatas.append(document_data)
    print(f"Processed {html_file}")
[실행결과]
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-glossary.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-glossary.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-screen.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-screen.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-info.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-info.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-playground.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-playground.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-start.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-start.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-procedure.html'})]
Processed /home/embeding/clovastudioguide/clovastudio-procedure.html
[Document(page_content='', metadata={'source': '/home/embeding/clovastudioguide/clovastudio-playground01.html'})]