Show / Hide Table of Contents

How to extract the plain text of the whole book

Example

Important

To run this example, you need to install the HtmlAgilityPack NuGet package.

using System;
using System.Text;
using HtmlAgilityPack;

namespace VersOne.Epub.ConsoleDemo
{
    /// <summary>
    /// Demo that prints the plain text of every content file in an EPUB book's reading order.
    /// </summary>
    internal static class ExtractPlainText
    {
        /// <summary>
        /// Reads the EPUB book at <paramref name="filePath"/> and writes the plain text of each
        /// text content file in its reading order to the console.
        /// </summary>
        /// <param name="filePath">Path to the EPUB file to read.</param>
        public static void Run(string filePath)
        {
            EpubBook book = EpubReader.ReadBook(filePath);
            foreach (EpubLocalTextContentFile textContentFile in book.ReadingOrder)
            {
                PrintTextContentFile(textContentFile);
            }
        }

        /// <summary>
        /// Parses the HTML content of a single content file, collects the text of all its text nodes
        /// (one per line, trimmed), and writes the result to the console followed by a blank line.
        /// </summary>
        /// <param name="textContentFile">The content file whose HTML content is converted to plain text.</param>
        private static void PrintTextContentFile(EpubLocalTextContentFile textContentFile)
        {
            HtmlDocument htmlDocument = new();
            htmlDocument.LoadHtml(textContentFile.Content);
            StringBuilder sb = new();

            // SelectNodes returns null (not an empty collection) when the XPath matches nothing,
            // e.g. for an empty content file, so it must be checked before enumerating.
            HtmlNodeCollection textNodes = htmlDocument.DocumentNode.SelectNodes("//text()");
            if (textNodes is not null)
            {
                foreach (HtmlNode node in textNodes)
                {
                    // InnerText keeps HTML entities (&amp;, &#160;, ...) verbatim; decode them so the
                    // output is real plain text. Trimming after decoding also removes decoded whitespace.
                    sb.AppendLine(HtmlEntity.DeEntitize(node.InnerText).Trim());
                }
            }

            string contentText = sb.ToString();
            Console.WriteLine(contentText);
            Console.WriteLine();
        }
    }
}

See it in action

Download the .NET console demo app, run it, and select option 2, "Extract plain text from the whole book".

  • Edit this page
In this article
Back to top Generated by DocFX