1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
//src - http://www.geoffstratton.com/cnet-docdocx-simple-html-converter
// Alternatively, you can always create the Word instance late-bound (no interop type needed):
// object wordApp = Activator.CreateInstance(Type.GetTypeFromProgID("Word.Application"));
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using Microsoft.Office.Interop.Word;
using System.IO;
using System.Diagnostics;
using HtmlAgilityPack;
namespace DocConverter
{
/// <summary>
/// Drop-target form. A Word document dropped onto the form is exported by Word as
/// filtered HTML, cleaned up with Html Agility Pack, written to
/// <c>finaltemp.html</c> in the temp directory, and opened in Notepad.
/// </summary>
public partial class docForm : Form
{
    // File names used inside the directory returned by Path.GetTempPath().
    private const string WordExportFileName = "temp.html";
    private const string FinalFileName = "finaltemp.html";

    // "Blacklisted" tags are removed completely, together with everything inside them.
    private static readonly string[] BlacklistedTags = { "span", "head" };

    // "Stripped" tags keep their content but lose every attribute,
    // so e.g. <p class="MsoNormal"> becomes <p>.
    private static readonly string[] StrippedTags = { "body", "div", "p", "strong", "ul", "li", "table", "tr", "td" };

    /// <summary>
    /// Builds the designer-generated UI and registers the form as a drag-and-drop target.
    /// </summary>
    public docForm()
    {
        InitializeComponent();
        AllowDrop = true;
        DragEnter += docForm_DragEnter;
        DragDrop += docForm_DragDrop;
    }

    /// <summary>
    /// Signals a copy operation when the dragged data contains a file drop;
    /// for any other data the effect is left untouched.
    /// </summary>
    void docForm_DragEnter(object sender, DragEventArgs e)
    {
        if (e.Data.GetDataPresent(DataFormats.FileDrop))
        {
            e.Effect = DragDropEffects.Copy;
        }
    }

    /// <summary>
    /// Converts the first dropped file to simplified HTML and shows the result in Notepad.
    /// Only the first file of a multi-file drop is processed.
    /// </summary>
    void docForm_DragDrop(object sender, DragEventArgs e)
    {
        // Gives us the path(s) of the dropped file(s); bail out if there is nothing usable.
        string[] files = e.Data.GetData(DataFormats.FileDrop) as string[];
        if (files == null || files.Length == 0)
        {
            return;
        }

        string tempDirectory = Path.GetTempPath();
        string wordExportPath = Path.Combine(tempDirectory, WordExportFileName);
        string finalPath = Path.Combine(tempDirectory, FinalFileName);

        // Invoke Word, open the document by path and let it generate the raw HTML.
        ExportToFilteredHtml(files[0], wordExportPath);

        // Now clean up Word's HTML using Html Agility Pack.
        // (To inspect the intermediate result, open wordExportPath in an editor here.)
        HtmlAgilityPack.HtmlDocument mangledHtml = new HtmlAgilityPack.HtmlDocument();
        mangledHtml.Load(wordExportPath);
        RemoveBlacklistedTags(mangledHtml);
        StripTagAttributes(mangledHtml);

        // Serialize in memory instead of saving to disk and re-reading the file:
        // the on-disk round trip wrote the file in the document's own encoding but
        // read it back with File.ReadAllText's UTF-8 default, which can garble
        // non-ASCII characters.
        string cleanedHtml;
        using (StringWriter writer = new StringWriter())
        {
            mangledHtml.Save(writer);
            cleanedHtml = writer.ToString();
        }

        File.WriteAllText(finalPath, CollapseLineBreaks(cleanedHtml));

        // Clean up the intermediate file, show the finished result in Notepad.
        File.Delete(wordExportPath);
        Process.Start("notepad.exe", finalPath);
    }

    /// <summary>
    /// Opens <paramref name="documentPath"/> in a new Word instance and saves it as
    /// filtered HTML to <paramref name="htmlPath"/>. The document is closed and Word
    /// is shut down even when opening or saving throws, so no orphaned Word process
    /// is left running.
    /// </summary>
    private static void ExportToFilteredHtml(string documentPath, string htmlPath)
    {
        Microsoft.Office.Interop.Word.Application word =
            new Microsoft.Office.Interop.Word.Application();
        try
        {
            Document document = word.Documents.Open(documentPath);
            try
            {
                // WdSaveFormat.wdFormatHTML would give more "complete" but worse HTML.
                document.SaveAs(htmlPath, WdSaveFormat.wdFormatFilteredHTML);
            }
            finally
            {
                document.Close();
            }
        }
        finally
        {
            // Close Word
            word.Quit();
        }
    }

    /// <summary>
    /// Removes every element whose tag is listed in <see cref="BlacklistedTags"/>,
    /// including all of its content.
    /// </summary>
    private static void RemoveBlacklistedTags(HtmlAgilityPack.HtmlDocument html)
    {
        foreach (string tag in BlacklistedTags)
        {
            HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tag);
            if (matches == null)
            {
                // SelectNodes yields null when no tag of that type exists; move on.
                continue;
            }

            foreach (HtmlNode node in matches)
            {
                if (node.ParentNode != null)
                {
                    node.ParentNode.RemoveChild(node);
                }
            }
        }
    }

    /// <summary>
    /// Removes all attributes from every element whose tag is listed in
    /// <see cref="StrippedTags"/>; the elements and their content are kept.
    /// </summary>
    private static void StripTagAttributes(HtmlAgilityPack.HtmlDocument html)
    {
        foreach (string tag in StrippedTags)
        {
            HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tag);
            if (matches == null)
            {
                // SelectNodes yields null when no tag of that type exists; move on.
                continue;
            }

            foreach (HtmlNode node in matches)
            {
                node.Attributes.RemoveAll();
            }
        }
    }

    /// <summary>
    /// Removes standalone CRLF line breaks and collapses each CRLF pair into one CRLF.
    /// </summary>
    /// <remarks>
    /// A single left-to-right pass: a doubled CRLF is matched as a whole and replaced
    /// by its captured second half, while a lone CRLF leaves the group unmatched and
    /// is replaced by the empty string. This gives the same result as the classic
    /// "swap in a placeholder" trick without corrupting text that happens to contain
    /// the placeholder.
    /// </remarks>
    private static string CollapseLineBreaks(string html)
    {
        return System.Text.RegularExpressions.Regex.Replace(html, "\r\n(\r\n)?", "$1");
    }
}
}
// origin - https://www.pipiscrew.com/?p=18637 wrd-doc-to-simple-html-converter