How to prepare your PDF docs for Smabbler API RAG using Open-Parse
Getting started
pip install openparse
Data processing
import openparse
import csv
import os
# Upper bound, in characters, on the text kept per PDF (getFileText slices to it).
maxFileTextLength = 2_000_000
def getFileText(filename):
    """Extract the text of one PDF located in the current directory.

    The file ``./<filename>`` is parsed with ``openparse.DocumentParser``;
    the ``text`` of every parsed node is joined with CRLF, and at most
    ``maxFileTextLength`` characters of the joined string are returned.
    """
    document = openparse.DocumentParser().parse(f"./{filename}")
    joined = "\r\n".join(node.text for node in document.nodes)
    # Cap the result at the module-level character limit.
    return joined[:maxFileTextLength]
def getPdfFiles():
    """Return the names of the PDF files in the current working directory.

    Only regular files are returned, so a directory named ``x.pdf`` is
    skipped. The extension is matched case-insensitively (``REPORT.PDF``
    counts as a PDF), and the names are sorted because ``os.listdir`` returns
    entries in arbitrary order; sorting keeps the export reproducible.
    """
    return sorted(
        name
        for name in os.listdir('.')
        # Logical `and` (not bitwise `&`): the suffix test is skipped for
        # entries that are not regular files.
        if os.path.isfile(name) and name.lower().endswith(".pdf")
    )
def exportFileTextsToCsv(fileTexts, outputPath="file-export.csv"):
    """Write (filename, text) pairs to a semicolon-delimited UTF-8 CSV file.

    Args:
        fileTexts: Iterable of indexable entries; element 0 is written to the
            ``filename`` column and element 1 to the ``text`` column.
        outputPath: Destination file. Defaults to ``file-export.csv`` in the
            current working directory, which is what existing callers get.

    The file starts with a ``filename;text`` header row. Fields are quoted
    only when needed (``csv.QUOTE_MINIMAL``).
    """
    # newline='' lets the csv module control line endings itself.
    with open(outputPath, "w", newline='', encoding="utf-8") as outfile:
        csvwriter = csv.writer(outfile, delimiter=';',
                               quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(["filename", "text"])
        # Only the first two elements of each entry are exported.
        csvwriter.writerows((entry[0], entry[1]) for entry in fileTexts)
# Script body: gather the PDFs in the working directory, extract the text of
# each one, and export the (filename, text) pairs to the CSV file.
files = getPdfFiles()
fileTexts = [(pdfName, getFileText(pdfName)) for pdfName in files]
exportFileTextsToCsv(fileTexts)
# Last updated