Posts Wrd Doc to Simple HTML Converter
Post
Cancel

Word Doc to Simple HTML Converter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
//src - http://www.geoffstratton.com/cnet-docdocx-simple-html-converter
// Alternatively, you can always create the Word instance from its ProgID:
// object wordApp = Activator.CreateInstance(Type.GetTypeFromProgID("Word.Application"));
using System;
using System.Collections.Generic;
using System.Windows.Forms;
using Microsoft.Office.Interop.Word;
using System.IO;
using System.Diagnostics;
using HtmlAgilityPack;

namespace DocConverter
{
    /// <summary>
    /// Form that accepts a dropped Word document, has Word export it as filtered
    /// HTML, simplifies that HTML with Html Agility Pack and opens the result in
    /// Notepad.
    /// </summary>
    public partial class docForm : Form
    {
        // "Blacklisted" tags are removed completely, together with everything inside them.
        private static readonly string[] BlacklistedTags = { "span", "head" };

        // "Stripped" tags keep their content but lose all attributes,
        // so e.g. <p class="MsoNormal"> becomes <p>.
        private static readonly string[] StrippedTags =
            { "body", "div", "p", "strong", "ul", "li", "table", "tr", "td" };

        /// <summary>
        /// Builds the form and registers the drag-and-drop handlers.
        /// </summary>
        public docForm()
        {
            InitializeComponent();
            this.AllowDrop = true;
            this.DragEnter += new DragEventHandler(docForm_DragEnter);
            this.DragDrop += new DragEventHandler(docForm_DragDrop);
        }

        /// <summary>
        /// Shows the "copy" cursor when the dragged data is a file drop.
        /// </summary>
        private void docForm_DragEnter(object sender, DragEventArgs e)
        {
            if (e.Data.GetDataPresent(DataFormats.FileDrop))
            {
                e.Effect = DragDropEffects.Copy;
            }
        }

        /// <summary>
        /// Converts the first dropped file to simplified HTML and shows it in Notepad.
        /// The result is written to "finaltemp.html" in the user's temp directory.
        /// </summary>
        private void docForm_DragDrop(object sender, DragEventArgs e)
        {
            // Gives us the path(s) of the dropped file(s); only the first one is converted.
            string[] files = e.Data.GetData(DataFormats.FileDrop) as string[];
            if (files == null || files.Length == 0)
            {
                return;
            }

            string tempDir = Path.GetTempPath();
            string wordHtmlPath = Path.Combine(tempDir, "temp.html");
            string finalHtmlPath = Path.Combine(tempDir, "finaltemp.html");

            HtmlAgilityPack.HtmlDocument mangledHTML = new HtmlAgilityPack.HtmlDocument();
            try
            {
                ExportFilteredHtml(files[0], wordHtmlPath);

                // Now, clean up Word's HTML using Html Agility Pack
                mangledHTML.Load(wordHtmlPath);
            }
            finally
            {
                // The intermediate file is no longer needed once it has been parsed
                // (File.Delete is a no-op when the file was never created).
                File.Delete(wordHtmlPath);
            }

            foreach (string blackTag in BlacklistedTags)
            {
                RemoveElements(mangledHTML, blackTag);
            }

            foreach (string stripTag in StrippedTags)
            {
                StripAttributes(mangledHTML, stripTag);
            }

            // Keep the cleaned markup in memory instead of saving and re-reading it:
            // Html Agility Pack may write the file in the encoding Word declared,
            // while File.ReadAllText assumes UTF-8, which would damage non-ASCII text.
            string cleanedHtml = CollapseLineBreaks(mangledHTML.DocumentNode.OuterHtml);
            File.WriteAllText(finalHtmlPath, cleanedHtml);

            // Show the finished result in Notepad (quoted in case the temp path contains spaces).
            Process.Start("notepad.exe", "\"" + finalHtmlPath + "\"");
        }

        /// <summary>
        /// Starts Word, opens <paramref name="documentPath"/> and saves it as filtered
        /// HTML to <paramref name="htmlPath"/>. Word is always shut down again, even
        /// when opening or saving fails, so no hidden Word process is left behind.
        /// </summary>
        private static void ExportFilteredHtml(string documentPath, string htmlPath)
        {
            Microsoft.Office.Interop.Word.Application application = new
                Microsoft.Office.Interop.Word.Application();
            Document doc = null;

            try
            {
                doc = application.Documents.Open(documentPath);

                // WdSaveFormat.wdFormatHTML gives more "complete" but worse HTML,
                // so the filtered variant is used.
                doc.SaveAs(htmlPath, WdSaveFormat.wdFormatFilteredHTML);
            }
            finally
            {
                try
                {
                    if (doc != null)
                    {
                        // Never prompt: the Word instance is not visible to the user.
                        doc.Close(WdSaveOptions.wdDoNotSaveChanges);
                    }
                }
                finally
                {
                    // Close Word
                    application.Quit(WdSaveOptions.wdDoNotSaveChanges);
                }
            }
        }

        /// <summary>
        /// Removes every element named <paramref name="tagName"/>, including its content.
        /// </summary>
        private static void RemoveElements(HtmlAgilityPack.HtmlDocument html, string tagName)
        {
            HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tagName);
            if (matches == null)
            {
                // No tags of that type; nothing to do
                return;
            }

            foreach (HtmlNode node in matches)
            {
                if (node.ParentNode != null)
                {
                    node.ParentNode.RemoveChild(node);
                }
            }
        }

        /// <summary>
        /// Removes all attributes from every element named <paramref name="tagName"/>.
        /// </summary>
        private static void StripAttributes(HtmlAgilityPack.HtmlDocument html, string tagName)
        {
            HtmlNodeCollection matches = html.DocumentNode.SelectNodes("//" + tagName);
            if (matches == null)
            {
                // No tags of that type; nothing to do
                return;
            }

            foreach (HtmlNode node in matches)
            {
                node.Attributes.RemoveAll();
            }
        }

        /// <summary>
        /// Removes standalone CRLFs and collapses each pair of consecutive CRLFs into
        /// a single CRLF. Done in one left-to-right pass so no placeholder text has to
        /// be injected into (and possibly collide with) the document.
        /// </summary>
        private static string CollapseLineBreaks(string html)
        {
            return System.Text.RegularExpressions.Regex.Replace(
                html,
                "\r\n(\r\n)?",
                m => m.Groups[1].Success ? "\r\n" : string.Empty);
        }

    }

}

origin - https://www.pipiscrew.com/?p=18637 wrd-doc-to-simple-html-converter

This post is licensed under CC BY 4.0 by the author.
Contents

Trending Tags