Initial commit

geoffstratton · Nov 10, 2019 · 7658efb · 7658efb
1 parent 367a196
commit 7658efb
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,15 @@
 # Doc/DocX Converter
  Converts Word documents into clean HTML
+
+I have this problem: no matter what my official job or title, people keep sending me Word documents that they want posted online to match the website styling.
+
+Yes, you can use Word to convert documents to HTML, but Microsoft's version of "HTML" frequently looks worse than if you just pasted in plain text. And yes, you can save Word documents as plain text, but then to use them on the web you have to add in the HTML tags.
+
+Finally I got fed up and wrote a converter to produce minimally formatted HTML that I can copy into common web editors like CKEditor or TinyMCE. The operation is linear: you take a Word .doc or .docx file, drag it onto a Windows form, the program invokes Word, converts your .doc/.docx to the cleanest HTML that Word can manage, parses the HTML using Html Agility Pack, and finally spits out a simple HTML document in Notepad that you can copy-paste into whatever web system you're using.
+
+Prerequisites:
+
+1. When building this I had the Microsoft.Office.Interop.Word 12.0 (Word 2007) library referenced from the project. The easiest way to meet this requirement is to install some recent version of Office, but any version of the Microsoft.Office.Interop.Word library that natively handles the .docx format should work.
+2. I had the very useful Html Agility Pack version 1.4.6 library referenced as well. I was using .NET 4.0 and the 4.0 version of the library. Unfortunately Html Agility Pack now seems to be abandoned, since the last update came in 2012. There is a downloadable .dll for .NET 4.5, but if you want it to work with an application targeting .NET 4.6 or later, you'll have to download the source and build it yourself.
+
+Later I realized a better way to do this might be to invoke the LibreOffice converter on the command line, convert your document to HTML or text, filter it with Python's BeautifulSoup library or sed or Ruby's Nokogiri, and then insert the results straight into the database of your web system. But maybe not: in text, tags like <table> and <ul> would be lost, and LibreOffice's HTML is still pretty ugly.
diff --git a/docx-converter.cs b/docx-converter.cs
@@ -0,0 +1,106 @@
+using System;
+using System.Collections.Generic;
+using System.Windows.Forms;
+using Microsoft.Office.Interop.Word;
+using System.IO;
+using System.Diagnostics;
+using HtmlAgilityPack;
+
+namespace DocConverter
+{
+ /// <summary>
+ /// Drop-target form: a Word document dragged onto it is exported by Word as
+ /// filtered HTML, cleaned with Html Agility Pack, and the result is opened in Notepad.
+ /// </summary>
+ public partial class docForm : Form
+ {
+     /// <summary>
+     /// Builds the form and wires up the drag-and-drop handlers.
+     /// </summary>
+     public docForm()
+     {
+         InitializeComponent();
+         this.AllowDrop = true;
+         this.DragEnter += new DragEventHandler(docForm_DragEnter);
+         this.DragDrop += new DragEventHandler(docForm_DragDrop);
+     }
+
+     /// <summary>
+     /// Shows the "copy" cursor when the dragged data contains a file drop.
+     /// </summary>
+     void docForm_DragEnter(object sender, DragEventArgs e)
+     {
+         if (e.Data.GetDataPresent(DataFormats.FileDrop))
+         {
+             e.Effect = DragDropEffects.Copy;
+         }
+     }
+
+     /// <summary>
+     /// Converts the first dropped file to simplified HTML and opens it in Notepad.
+     /// Only the first file of a multi-file drop is processed.
+     /// </summary>
+     void docForm_DragDrop(object sender, DragEventArgs e)
+     {
+         // Gives us the path(s) of the dropped file(s); bail out on an empty or foreign payload
+         string[] files = e.Data.GetData(DataFormats.FileDrop) as string[];
+         if (files == null || files.Length == 0)
+         {
+             return;
+         }
+
+         string result = Path.GetTempPath();
+         string wordHtmlPath = result + "temp.html";
+         string cleanedHtmlPath = result + "newtemp.html";
+         string finalHtmlPath = result + "finaltemp.html";
+
+         try
+         {
+             ExportFilteredHtml(files[0], wordHtmlPath);
+
+             //Uncomment to see results so far
+             //Process.Start("notepad.exe", wordHtmlPath);
+
+             CleanHtmlFile(wordHtmlPath, cleanedHtmlPath);
+
+             // Remove standalone CRLF
+             string badHTML = File.ReadAllText(cleanedHtmlPath);
+             File.WriteAllText(finalHtmlPath, CollapseLineBreaks(badHTML));
+         }
+         finally
+         {
+             // Clean up the intermediate files even when a step above failed;
+             // File.Delete is a no-op for a file that does not exist
+             File.Delete(wordHtmlPath);
+             File.Delete(cleanedHtmlPath);
+         }
+
+         // Show the finished result in Notepad
+         Process.Start("notepad.exe", finalHtmlPath);
+     }
+
+     // Opens sourcePath in Word and saves it as filtered HTML to htmlPath, always shutting Word down afterwards.
+     private static void ExportFilteredHtml(string sourcePath, string htmlPath)
+     {
+         Microsoft.Office.Interop.Word.Application application = new
+             Microsoft.Office.Interop.Word.Application();
+         try
+         {
+             Document doc = application.Documents.Open(sourcePath);
+             try
+             {
+                 // wdFormatHTML gives more "complete" but worse HTML, so the filtered format is used
+                 doc.SaveAs(htmlPath, WdSaveFormat.wdFormatFilteredHTML);
+             }
+             finally
+             {
+                 doc.Close();
+             }
+         }
+         finally
+         {
+             // Without this a failed Open/SaveAs would leave an invisible Word process running
+             application.Quit();
+         }
+     }
+
+     // Loads Word's HTML from inputPath, removes blacklisted tags, strips attributes from kept tags, and saves to outputPath.
+     private static void CleanHtmlFile(string inputPath, string outputPath)
+     {
+         HtmlAgilityPack.HtmlDocument mangledHTML = new HtmlAgilityPack.HtmlDocument();
+         mangledHTML.Load(inputPath);
+
+         //"Blacklisted" tags and all inclusive data will be removed completely
+         //"Stripped" tags will have all attributes removed, so <p class="someclass"> becomes <p>
+         string[] blacklistedTags = { "span", "head" };
+         string[] strippedTags = { "body", "div", "p", "strong", "ul", "li", "table", "tr", "td" };
+
+         foreach (var blackTag in blacklistedTags)
+         {
+             // SelectNodes yields null (not an empty collection) when nothing matches
+             HtmlNodeCollection doomed = mangledHTML.DocumentNode.SelectNodes("//" + blackTag);
+             if (doomed == null)
+             {
+                 continue;
+             }
+
+             foreach (HtmlNode item in doomed)
+             {
+                 item.ParentNode.RemoveChild(item);
+             }
+         }
+
+         foreach (var stripTag in strippedTags)
+         {
+             HtmlNodeCollection kept = mangledHTML.DocumentNode.SelectNodes("//" + stripTag);
+             if (kept == null)
+             {
+                 continue;
+             }
+
+             foreach (HtmlNode item in kept)
+             {
+                 item.Attributes.RemoveAll();
+             }
+         }
+
+         mangledHTML.Save(outputPath);
+     }
+
+     // Drops single CRLFs and turns each CRLF pair into one CRLF.
+     private static string CollapseLineBreaks(string html)
+     {
+         // A placeholder protects the double breaks while the single ones are removed
+         html = html.Replace("\r\n\r\n", "ackThbbtt");
+         html = html.Replace("\r\n", "");
+         return html.Replace("ackThbbtt", "\r\n");
+     }
+
+ }
+
+}