-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf-to-markdown.py
230 lines (197 loc) · 8.18 KB
/
pdf-to-markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import os
import glob
import configparser
import logging
import openai as OpenAI
import requests
import base64
import aiohttp
import random
import asyncio
import shutil
from aiohttp import client_exceptions
from pdf2image import convert_from_path
from PIL import Image
from multiprocessing import Pool
# Set up logging
# Root logging is configured at INFO; `logger` is the module-level logger the
# functions below use to report encode failures, retries and invalid paths.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Read configuration
# NOTE: 'config.ini' is a relative path, so it is resolved against the current
# working directory. ConfigParser.read() silently skips files it cannot open;
# in that case the config.get() calls below raise configparser.NoSectionError.
config = configparser.ConfigParser()
config.read('config.ini')
# API key that get_labels/get_label pass to label_image_async as the bearer token.
openai_api_key = config.get('OpenAI', 'API_KEY')
# Directory that the __main__ block validates and scans recursively.
SOURCE_DIRECTORY = config.get('Directories', 'SOURCE_DIRECTORY')
# Instruction text sent as the "text" content part of every request built by
# label_image_async; the image is attached as a separate "image_url" part.
IMAGE_LABEL_PROMPT = """
I have given you an image. Your goal is to simply convert the text of the image to markdown:
* Transcribe the recognized text into Markdown syntax.
* Preserve basic formatting elements such as headers, lists, bold, and italics.
* Convert image captions and annotations into appropriate Markdown or alt text formats.
* If there's a chart or table try your best to preserve that in markdown.
OUTPUT NOTHING ELSE BESIDES THE TEXT FROM THE IMAGE
"""
def save_image(image, i, pdf_path):
    """Write one rendered page as a PNG next to its source PDF.

    The file is placed in a ``pngs`` sub-directory of the PDF's directory
    (created on demand) and named ``<pdf stem>_<i>.png``.

    Args:
        image: Object exposing ``save(path, format)`` (a PIL image in practice).
        i: Page index used as the filename suffix.
        pdf_path: Path of the PDF the page came from.

    Returns:
        The path of the PNG that was written.
    """
    parent_dir, pdf_name = os.path.split(pdf_path)
    stem, _ = os.path.splitext(pdf_name)

    output_dir = os.path.join(parent_dir, 'pngs')
    os.makedirs(output_dir, exist_ok=True)

    target = os.path.join(output_dir, f"{stem}_{i}.png")
    image.save(target, 'PNG')
    return target
def convert_pdf_to_images(pdf_path):
    """Render every page of a PDF and save each page as a PNG file.

    Pages are rendered with ``convert_from_path`` and then written to disk
    by ``save_image`` through a ``multiprocessing.Pool``.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        List of PNG paths, in page order.
    """
    pages = convert_from_path(pdf_path)
    # One (page, index, pdf_path) argument tuple per page for starmap.
    jobs = [(page, index, pdf_path) for index, page in enumerate(pages)]
    with Pool() as workers:
        saved_paths = workers.starmap(save_image, jobs)
    return saved_paths
def encode_image(image):
    """Return the base64 text of an in-memory image encoded as JPEG.

    The previous body called ``image.save(fp=None, ...)`` and passed the
    result (``None``) to ``base64.b64encode``, which cannot work. The image
    is now serialised into an in-memory buffer first.

    NOTE: a later ``encode_image(image_path)`` definition in this module
    rebinds the name, so this variant is only reachable before that point.

    Args:
        image: Object exposing ``save(fp, format=...)`` (a PIL image in
            practice).

    Returns:
        The JPEG bytes of ``image``, base64-encoded, as a ``str``.
    """
    import io  # local import: `io` is not among the module's top-level imports

    buffer = io.BytesIO()
    image.save(buffer, format='JPEG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def find_images(directory):
    """Recursively collect image files below ``directory``.

    Matches the lower-case extensions png, jpg, jpeg, gif and webp. Results
    are grouped by extension in that order.

    Args:
        directory: Root directory to search.

    Returns:
        List of matching paths as produced by ``glob.glob``.
    """
    suffixes = ('png', 'jpg', 'jpeg', 'gif', 'webp')
    return [
        match
        for suffix in suffixes
        for match in glob.glob(f"{directory}/**/*.{suffix}", recursive=True)
    ]
def encode_image(image_path):
    """Read a file from disk and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64-encoded file contents as a ``str``, or ``None`` if
        reading or encoding failed (the failure is logged).
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode('utf-8')
    except Exception as e:
        # Best effort: report the problem and let the caller see None.
        logger.error(f"Failed to encode image {image_path}: {e}")
    return None
async def label_image_async(session, image_path, openai_api_key, max_retries=5, initial_delay=1.0):
    """Send one image to the OpenAI chat-completions endpoint and return the reply.

    The image is base64-encoded with ``encode_image`` and posted together
    with ``IMAGE_LABEL_PROMPT``. HTTP 429 responses and
    ``ServerDisconnectedError`` are retried with exponential back-off plus
    up to one second of random jitter.

    Every failure path returns a string starting with ``"Error: "``;
    earlier code could fall off the end of the retry loop and return
    ``None``, and would still send a request when encoding had failed.

    Args:
        session: Open ``aiohttp.ClientSession`` used for the request.
        image_path: Path of the image file to upload.
        openai_api_key: Bearer token for the ``Authorization`` header.
        max_retries: Maximum number of request attempts.
        initial_delay: Base back-off in seconds; doubled after each retry.

    Returns:
        The message content of the first choice on HTTP 200, otherwise an
        ``"Error: ..."`` string describing the failure.
    """
    base64_image = encode_image(image_path)
    if base64_image is None:
        # encode_image has already logged the cause. Without this guard the
        # literal text "None" would be embedded in the data URL and sent.
        return f"Error: could not encode image {image_path}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }
    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": IMAGE_LABEL_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE(review): the MIME type is always image/jpeg even
                            # when the file is PNG/GIF/WebP -- confirm the API
                            # tolerates this.
                            "url": f"data:image/jpeg;base64,{base64_image}",
                            "detail": "low",
                        }
                    }
                ]
            }
        ],
        "max_tokens": 400,
    }

    # Use GPT-4 Turbo to label the image
    delay = initial_delay
    for attempt in range(max_retries):
        is_last_attempt = attempt >= max_retries - 1
        try:
            async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) as response:
                if response.status == 200:
                    data = await response.json()
                    return data["choices"][0]["message"]["content"]
                if response.status != 429:
                    # Only rate limiting is worth retrying; report anything else.
                    return f"Error: {response.status}"
                logger.warning("Rate limit error: backing off and retrying")
                if is_last_attempt:
                    return f"Error: {response.status}"
        except client_exceptions.ServerDisconnectedError:
            logger.warning("Server disconnected: backing off and retrying")
            if is_last_attempt:
                return "Error: Server disconnected"
        # Reached only when a retry is due; the response context is already
        # closed here, so no connection is held while sleeping.
        await asyncio.sleep(delay + random.uniform(0, 1))
        delay *= 2

    # Only reachable when max_retries <= 0 and the loop body never ran.
    return "Error: no request attempts were made"
async def get_labels(image_files):
    """Label all given images concurrently over one shared HTTP session.

    Args:
        image_files: Iterable of image paths.

    Returns:
        List of results from ``label_image_async``, in the same order as
        ``image_files``.
    """
    async with aiohttp.ClientSession() as http_session:
        pending = [
            label_image_async(http_session, path, openai_api_key)
            for path in image_files
        ]
        return await asyncio.gather(*pending)
def validate_directory(directory):
    """Return True when ``directory`` names an existing directory.

    Otherwise log why the path was rejected (missing, or present but not a
    directory) and return False.
    """
    if os.path.isdir(directory):
        return True

    if os.path.exists(directory):
        logger.error(f"Provided path is not a directory: {directory}")
    else:
        logger.error(f"Directory does not exist: {directory}")
    return False
def sanitize_label(label):
    """Replace each of the characters ``/ : * ? " < > |`` in ``label`` with ``_``.

    Args:
        label: Text to clean up.

    Returns:
        A copy of ``label`` with every listed character replaced.
    """
    replacements = str.maketrans({ch: '_' for ch in '/:*?"<>|'})
    return label.translate(replacements)
def label_and_move_images(src_path, ask_to_proceed=True, debug_output=False, keep_originals=False):
    """Label every image under ``src_path`` and file it under ``src_path/sorted``.

    Each label is split at its first underscore: the part before it (plus a
    trailing "s") names the destination folder, and the rest becomes the
    file name. Both parts are passed through ``sanitize_label``.

    Compared with the earlier version this one:
      * skips images whose label is missing or is an ``"Error: ..."``
        string, leaving the original file in place, instead of crashing on
        ``None`` or filing the error text as a name;
      * sanitizes the folder name as well as the file name;
      * never overwrites an existing destination file (a numeric suffix is
        appended), so two images with the same label no longer replace each
        other before the originals are deleted.

    Args:
        src_path: Directory searched recursively for images.
        ask_to_proceed: When true, ask for confirmation on stdin first.
        debug_output: When true, print every file that was found.
        keep_originals: When false, delete each source image after copying.

    Returns:
        None.
    """
    logger.info(f"Searching for images in {src_path}")
    print(f"Searching for images in {src_path}")
    image_files = find_images(src_path)
    if debug_output:
        for file in image_files:
            print(f"Found file: {file}")

    num_images = len(image_files)
    num_input_tokens = num_images * 350  # Example calculation
    num_output_tokens = num_images * 10
    openai_price = num_input_tokens * 0.00001 + num_output_tokens * 0.00003
    print(f"Calculated cost: ${openai_price}")

    if ask_to_proceed:
        proceed = input(f"Found {num_images} images. Proceed with classification? (y/n) ")
        if proceed.lower() != 'y':
            return

    print("Labelling image files. This may take a while.")
    labels = asyncio.run(get_labels(image_files))
    print(f"Retrieved labels from OpenAI. Moving to sorted folder within {src_path}")

    dst_path = os.path.join(src_path, "sorted")
    os.makedirs(dst_path, exist_ok=True)

    for image, label in zip(image_files, labels):
        if not label or label.startswith("Error:"):
            # Labelling failed for this image: keep it where it is.
            logger.warning(f"Skipping {image}: no usable label ({label!r})")
            continue

        # Same split as before: text up to the first "_" is the folder.
        folder_name, _, remainder = label.partition("_")
        folder_path = os.path.join(dst_path, sanitize_label(folder_name) + "s")
        os.makedirs(folder_path, exist_ok=True)

        stem = sanitize_label(remainder)
        extension = os.path.splitext(image)[1]
        destination = os.path.join(folder_path, stem + extension)
        counter = 1
        while os.path.exists(destination):
            # Keep earlier files instead of silently replacing them.
            destination = os.path.join(folder_path, f"{stem}_{counter}{extension}")
            counter += 1

        shutil.copy(image, destination)
        if not keep_originals:
            os.remove(image)  # Delete the original image
# Current user's home directory; no other code in the visible part of this
# file reads it.
HOME = os.path.expanduser('~') # Home path
async def get_label(image_file):
    """Label a single image using a short-lived HTTP session.

    Args:
        image_file: Path of the image to send.

    Returns:
        Whatever ``label_image_async`` returns for this image.
    """
    async with aiohttp.ClientSession() as http_session:
        result = await label_image_async(http_session, image_file, openai_api_key)
    return result
def write_to_markdown(label, markdown_file):
    """Append ``label`` to a Markdown file inside a ``markdown`` sub-directory.

    The output path is ``<dir of markdown_file>/markdown/<basename of
    markdown_file>``; the sub-directory is created if needed. The text is
    followed by a blank line.

    The file is written as UTF-8 explicitly. Relying on the platform
    default encoding can raise ``UnicodeEncodeError`` (or produce
    mis-encoded files) when the transcribed text contains non-ASCII
    characters.

    Args:
        label: Text to append.
        markdown_file: Nominal path of the Markdown file; only its
            directory and base name are used.
    """
    md_dir = os.path.join(os.path.dirname(markdown_file), 'markdown')
    os.makedirs(md_dir, exist_ok=True)
    markdown_path = os.path.join(md_dir, os.path.basename(markdown_file))
    with open(markdown_path, 'a', encoding='utf-8') as f:
        f.write(f"{label}\n\n")
def process_file(file_path):
    """Transcribe one PDF or image file into Markdown.

    A PDF is first rendered to one PNG per page with
    ``convert_pdf_to_images``; an image file is used as is. Each image is
    then sent through ``get_label`` one at a time, and every result is
    appended via ``write_to_markdown`` under the name ``<file stem>.md``.

    A ``None`` result is skipped with a warning; previously it was written
    to the Markdown file as the literal text "None".

    Args:
        file_path: Path of a ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``,
            ``.gif`` or ``.webp`` file (extension compared
            case-insensitively).

    Returns:
        None. Unsupported extensions are reported on stdout and ignored.
    """
    # Get the file extension
    file_extension = os.path.splitext(file_path)[1].lower()
    # Supported image extensions
    supported_image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp')

    # Check if the file is a PDF or an image
    if file_extension == '.pdf':
        # Convert PDF to images
        image_files = convert_pdf_to_images(file_path)
    elif file_extension in supported_image_extensions:
        # If it's an image, no need for conversion
        image_files = [file_path]
    else:
        print(f"Unsupported file type: {file_extension}")
        return

    # Convert images to markdown
    markdown_file = f"{os.path.splitext(file_path)[0]}.md"
    for image_file in image_files:
        label = asyncio.run(get_label(image_file))
        if label is None:
            logger.warning(f"No transcription returned for {image_file}; skipping")
            continue
        write_to_markdown(label, markdown_file)
if __name__ == "__main__":
    # Script entry point: transcribe every PDF and image found under
    # SOURCE_DIRECTORY, or report that the configured path is unusable.
    if not validate_directory(SOURCE_DIRECTORY):
        print("Invalid source directory. Please check the path and try again.")
    else:
        # Find all PDF and image files, grouped by extension in this order.
        wanted_extensions = ('pdf', 'png', 'jpg', 'jpeg', 'gif', 'webp')
        files = [
            match
            for extension in wanted_extensions
            for match in glob.glob(f"{SOURCE_DIRECTORY}/**/*.{extension}", recursive=True)
        ]
        for file in files:
            process_file(file)