Index PDF content

It just occurred to me that building an index of PDF content is part of the T3 training. It's rather new, so I thought I'd share it here in case you don't visit that part of our doc site.
I found that AbcPDF did quite a poor job, especially because it did not clean up after itself, so I ended up using the iTextSharp library:
using Dynamicweb.Diagnostics.Tracking;
using Dynamicweb.Indexing.Schemas;
using Dynamicweb.Configuration;
using Dynamicweb.Content.Files;
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using iTextSharp;

namespace Dynamicweb.Indexing
{
    /// <summary>
    /// Custom index builder that indexes PDF files found below a start folder.
    /// Each PDF becomes one index document holding file metadata, the text extracted
    /// with iTextSharp and (when a domain is configured) an absolute link to the file.
    /// </summary>
    public class CustomPDFIndexBuilder : IndexBuilderBase
    {
        // Paths longer than this are skipped when the file list is prepared.
        private const int MaxPathLength = 260;

        // No http context available - getting domain from custom setting. Used for building complete link to file.
        private string Domain = SystemConfiguration.Instance.GetValue("/Globalsettings/Settings/CustomPDFFileIndexer/Domain"); // your-domain.com
        private string StartFolder = FilesAndFolders.GetFilesFolderName();

        /// <summary>
        /// List of supported actions
        /// </summary>
        public override IEnumerable<string> SupportedActions
        {
            get
            {
                return new string[] { "Full", "Update" };
            }
        }

        /// <summary>
        /// Gets default settings collection ("StartFolder" and "Domain")
        /// </summary>
        public override IDictionary<string, object> DefaultSettings
        {
            get { return new Dictionary<string, object> { { "StartFolder", StartFolder }, { "Domain", Domain } }; }
        }

        /// <summary>
        /// Default constructor. Uses the "Full" action and an empty settings collection.
        /// </summary>
        public CustomPDFIndexBuilder()
        {
            Action = "Full";
            Settings = new Dictionary<string, string>();
        }

        /// <summary>
        /// Creates new object using settings data
        /// </summary>
        /// <param name="settings">Builder settings; a null value is treated as an empty collection.</param>
        public CustomPDFIndexBuilder(IDictionary<string, string> settings)
        {
            Action = "Full";
            // Build() reads Settings unconditionally, so never leave it null.
            Settings = settings ?? new Dictionary<string, string>();
        }

        /// <summary>
        /// Gets index builder fields: the fields of <see cref="FileIndexSchemaExtender"/> plus the PDF specific ones.
        /// </summary>
        /// <returns>The field definitions; never null.</returns>
        public override IEnumerable<FieldDefinitionBase> GetFields()
        {
            // Copy into a list we own instead of casting the extender's result:
            // the cast yielded null (and dropped the custom fields) whenever the
            // extender returned anything other than a List, and adding to the
            // extender's own list would modify an object we do not control.
            var fields = new List<FieldDefinitionBase>();
            var extenderFields = new FileIndexSchemaExtender().GetFields();
            if (extenderFields != null)
            {
                fields.AddRange(extenderFields);
            }

            // Add your custom fields
            // NOTE(review): Analyzed = false stores the text as a single term - confirm this is intended for free-text search.
            fields.Add(new FieldDefinition() { Name = "Text Content", SystemName = "TextContent", Source = "TextContent", TypeName = "System.String", Group = "PDF Specific", Indexed = true, Analyzed = false, Stored = true });
            // NOTE(review): SystemName "LinktToFile" looks like a typo of "LinkToFile". It is kept unchanged
            // here because renaming an index field affects existing queries and templates - confirm and fix deliberately.
            fields.Add(new FieldDefinition() { Name = "Link to file", SystemName = "LinktToFile", Source = "LinkToFile", TypeName = "System.String", Group = "PDF Specific", Indexed = true, Analyzed = false, Stored = true });

            return fields;
        }

        /// <summary>
        /// Builds the PDF file index. For the "Full" action every *.pdf file below the start folder
        /// is written to the index; a failure on a single file is logged and the file is skipped.
        /// Any other failure is reported through <c>tracker.Fail</c>.
        /// </summary>
        /// <param name="writer">Index writer the documents are added to.</param>
        /// <param name="tracker">Tracker used for logging and progress.</param>
        public override void Build(IIndexWriter writer, Tracker tracker)
        {
            tracker.LogInformation("{0} building using {1}", GetType().FullName, writer.GetType().FullName);
            try
            {
                tracker.LogInformation("Opening index writer.");
                writer.Open(false);
                tracker.LogInformation("Opened index writer to overwrite index");

                //load builder settings
                string settingValue;
                if (Settings.TryGetValue("StartFolder", out settingValue))
                {
                    StartFolder = settingValue;
                }

                if (Settings.TryGetValue("Domain", out settingValue))
                {
                    Domain = settingValue;
                }

                tracker.LogInformation("StartFolder: '{0}'", StartFolder);
                tracker.LogInformation("Domain: '{0}'", Domain);

                // Static Equals is null-safe and ordinal, like the instance call it replaces.
                if (string.Equals(Action, "Full"))
                {
                    //process files
                    tracker.LogInformation("Starting processing files.");
                    string directory = Core.SystemInformation.MapPath("/Files/") + "\\" + (StartFolder ?? string.Empty).Trim(new char[] { '/', '\\' });
                    if (Directory.Exists(directory))
                    {
                        List<string> fileList = FileList(directory, tracker);
                        tracker.Status.TotalCount = fileList.Count;

                        foreach (string file in fileList)
                        {
                            try
                            {
                                FileInfo fileInfo = new FileInfo(file);
                                IndexDocument document = new IndexDocument();
                                document["FileName"] = fileInfo.Name;
                                document["FileFullName"] = fileInfo.FullName;
                                document["LinkToFile"] = LinkToFile(fileInfo.FullName);
                                document["Extension"] = fileInfo.Extension;
                                document["TextContent"] = GetPdfText(fileInfo.FullName, tracker);
                                document["DirectoryFullName"] = fileInfo.DirectoryName;
                                WriteDocument(writer, tracker, document, fileInfo.FullName);
                            }
                            catch (Exception ex)
                            {
                                // One bad file must not abort the whole build.
                                tracker.LogInformation("Failed getting file-info from '{0}'. Failed with exception: {1}", file, ex.Message);
                            }
                        }
                    }
                    else
                    {
                        tracker.LogInformation("Directory '{0}' does not exist. No files were processed.", directory);
                    }
                    tracker.LogInformation("--- Finished processing files ---");
                }
                else
                {
                    //check other actions and handle them
                    tracker.LogInformation("Action '{0}' is not implemented by this builder. Nothing was indexed.", Action);
                }
            }
            catch (Exception ex)
            {
                tracker.Fail("Custom index builder experienced a fatal error: ", ex);
            }
        }

        /// <summary>
        /// Lets the registered extenders process the document, writes it to the index
        /// and updates the tracker (current file and counter).
        /// </summary>
        private void WriteDocument(IIndexWriter writer, Tracker tracker, IndexDocument document, string filePath)
        {
            //allow extenders to process the index document
            foreach (var extender in Extenders)
            {
                extender.ExtendDocument(document);
            }
            //write to index
            writer.AddDocument(document);

            tracker.Status.Meta["CurrentFile"] = filePath;
            tracker.IncrementCounter();
        }

        /// <summary>
        /// Collects the full paths of all *.pdf files below <paramref name="dir"/> (recursively).
        /// Paths longer than 260 characters, and paths that <see cref="FileInfo"/> rejects, are logged and skipped.
        /// </summary>
        /// <param name="dir">Directory to search.</param>
        /// <param name="tracker">Tracker used for logging skipped files.</param>
        /// <returns>The usable PDF file paths.</returns>
        private List<string> FileList(string dir, Tracker tracker)
        {
            // Prepare the final list of PDF files
            string[] files = Directory.GetFiles(dir, "*.pdf", SearchOption.AllDirectories);
            List<string> returnList = new List<string>(files.Length);

            foreach (string path in files)
            {
                try
                {
                    if (path.Length > MaxPathLength)
                    {
                        tracker.LogInformation("Length of full path to file exceeded 260 characters. File ignored: '{0}'", path);
                        continue;
                    }

                    // Constructed only for validation: FileInfo throws on paths it cannot handle,
                    // which lands in the catch below so the file is skipped.
                    new FileInfo(path);
                    returnList.Add(path);
                }
                catch (Exception ex)
                {
                    tracker.LogInformation("Preparing file list failed with the exception: '{0}'", ex.Message);
                }
            }
            return returnList;
        }

        /// <summary>
        /// Extracts the text of every page of a PDF using iTextSharp.
        /// </summary>
        /// <param name="inputFile">Full path to the PDF file.</param>
        /// <param name="tracker">Tracker used for logging parse failures.</param>
        /// <returns>
        /// The concatenated page texts. If parsing fails, the failure is logged and the text
        /// extracted so far (possibly empty) is returned.
        /// </returns>
        private string GetPdfText(string inputFile, Tracker tracker)
        {
            var text = new System.Text.StringBuilder();
            iTextSharp.text.pdf.PdfReader reader = null;
            try
            {
                reader = new iTextSharp.text.pdf.PdfReader(inputFile);

                // Page numbers are 1-based and NumberOfPages is inclusive; the previous
                // "i < NumberOfPages" condition silently dropped the last page.
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    // A fresh strategy per page, as in the original code.
                    var strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                    text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy));
                }
            }
            catch (Exception ex)
            {
                tracker.LogInformation("iTextSharper failed parsing PDF: '{0}'. Failed with exception: {1}", inputFile, ex.Message);
            }
            finally
            {
                // Always release whatever the reader holds, also when parsing failed half-way.
                if (reader != null)
                {
                    reader.Close();
                }
            }
            return text.ToString();
        }

        /// <summary>
        /// Builds an absolute https link to a file from its physical path.
        /// </summary>
        /// <param name="filePath">Full physical path of the file.</param>
        /// <returns>
        /// "https://{Domain}/Files/..." when a domain is configured and the path contains "\Files";
        /// otherwise the unchanged <paramref name="filePath"/>.
        /// </returns>
        private string LinkToFile(string filePath)
        {
            // If not found in Globalsettings, just return the file and its path. You can still build a complete URL on the template.
            if (string.IsNullOrEmpty(Domain) || string.IsNullOrEmpty(filePath))
            {
                return filePath;
            }

            // Explicit check instead of relying on Substring(-1) throwing.
            int filesIndex = filePath.IndexOf(@"\Files", StringComparison.Ordinal);
            if (filesIndex < 0)
            {
                return filePath;
            }

            string relativeUrl = filePath.Substring(filesIndex).Replace(@"\", "/");
            return string.Format("https://{0}{1}", Domain, relativeUrl);
        }
    }
}
Developer forum

Index PDF content

Replies