-
Notifications
You must be signed in to change notification settings - Fork 0
/
caj2txt.py
executable file
·57 lines (42 loc) · 1.79 KB
/
caj2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from caj2pdf import caj2pdf
from pdfminer.high_level import extract_text
import os,re
import glob
#定义你下载的一堆caj文件目录
caj_path = "/home/ubuntu/caj2txt/cajcopy"
#定义你的txt输出目录
output_path = "/home/ubuntu/caj2txt/txt"
def batch_convert_to_text(caj_path):
# 获取指定文件夹下所有的 .caj 文件路径
caj_files = glob.glob(os.path.join(caj_path, "*.caj"))
for caj_file in caj_files:
# 转换为 PDF
caj2pdf.convert_to_pdf(caj_file)
def pdf_to_text(caj_path, output_path):
# 读取生成pdf位置
pdf_files = glob.glob(os.path.join(caj_path, "*.pdf"))
for pdf in pdf_files:
# 构建输出文件路径,将PDF扩展名改为txt
txt_output_path = os.path.join(output_path, re.sub(r'\.pdf$', '.txt', os.path.basename(pdf)))
# PDF 转换为文本并保存
text = extract_text(pdf)
with open(txt_output_path, 'w', encoding='utf-8') as f:
f.write(text)
#开关caj转pdf
batch_convert_to_text(caj_path)
# 开关pdf转txt
#pdf_to_text(caj_path, output_path)
# 原作者把输出封装了当前目录
# 因本人编程不是很好,只会用GPT拉屎,为了目录结构整洁,统一在最后将同一目录(./caj)下pdf移动到./pdf目录下,./txt文件不变
# 源文件夹路径
#source_folder = './caj'
# 目标文件夹路径
#destination_folder = './pdf'
# 遍历源文件夹中的文件
#for filename in os.listdir(source_folder):
# 如果文件是以.pdf结尾的文件,则移动到目标文件夹
# if filename.endswith('.pdf'):
# source_path = os.path.join(source_folder, filename)
# destination_path = os.path.join(destination_folder, filename)
# os.rename(source_path, destination_path)
# print(f'Moved {filename} to {destination_folder}')