It just occurred to me that building an index of PDF content is part of the T3 training. The material is rather new, so I thought I'd share it here in case you don't visit that part of our doc site.
I found that AbcPDF did quite a poor job, especially because it did not clean up after itself, so I ended up using the iTextSharp library instead:
using Dynamicweb.Diagnostics.Tracking;
using Dynamicweb.Indexing.Schemas;
using Dynamicweb.Configuration;
using Dynamicweb.Content.Files;
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using iTextSharp;

namespace Dynamicweb.Indexing
{
    /// <summary>
    /// Index builder that walks a folder for PDF files, extracts their text with iTextSharp
    /// and writes one index document per file.
    /// </summary>
    public class CustomPDFIndexBuilder : IndexBuilderBase
    {
        // No http context available - getting domain from custom setting. Used for building complete link to file.
        private string Domain = SystemConfiguration.Instance.GetValue("/Globalsettings/Settings/CustomPDFFileIndexer/Domain"); // your-domain.com
        private string StartFolder = FilesAndFolders.GetFilesFolderName();

        /// <summary>
        /// List of supported actions
        /// </summary>
        public override IEnumerable<string> SupportedActions
        {
            get { return new string[] { "Full", "Update" }; }
        }

        /// <summary>
        /// Gets default settings collection (the start folder and the domain used for file links)
        /// </summary>
        public override IDictionary<string, object> DefaultSettings
        {
            get
            {
                return new Dictionary<string, object>
                {
                    { "StartFolder", StartFolder },
                    { "Domain", Domain }
                };
            }
        }

        /// <summary>
        /// Default constructor
        /// </summary>
        public CustomPDFIndexBuilder()
        {
            Action = "Full";
            Settings = new Dictionary<string, string>();
        }

        /// <summary>
        /// Creates new object using settings data
        /// </summary>
        /// <param name="settings">Builder settings; a null value is treated as an empty collection.</param>
        public CustomPDFIndexBuilder(IDictionary<string, string> settings)
        {
            Action = "Full";
            // Guard against null so Build does not fail when it looks up settings.
            Settings = settings ?? new Dictionary<string, string>();
        }

        /// <summary>
        /// Gets index builder fields: the fields supplied by <c>FileIndexSchemaExtender</c>
        /// plus the two PDF-specific fields defined here.
        /// </summary>
        /// <returns>The field definitions; never null.</returns>
        public override IEnumerable<FieldDefinitionBase> GetFields()
        {
            FileIndexSchemaExtender extender = new FileIndexSchemaExtender();

            // Copy into a fresh list instead of casting to List<>: a failed cast would
            // return null to the caller and silently drop the custom fields.
            var extenderFields = extender.GetFields() as IEnumerable<FieldDefinitionBase>;
            List<FieldDefinitionBase> fields = extenderFields != null
                ? new List<FieldDefinitionBase>(extenderFields)
                : new List<FieldDefinitionBase>();

            // Add your custom fields
            fields.Add(new FieldDefinition()
            {
                Name = "Text Content",
                SystemName = "TextContent",
                Source = "TextContent",
                TypeName = "System.String",
                Group = "PDF Specific",
                Indexed = true,
                Analyzed = false,
                Stored = true
            });

            // NOTE(review): SystemName is spelled "LinktToFile" while Source is "LinkToFile".
            // This looks like a typo, but it is left unchanged because existing indexes or
            // templates may already reference the field by this name - confirm before renaming.
            fields.Add(new FieldDefinition()
            {
                Name = "Link to file",
                SystemName = "LinktToFile",
                Source = "LinkToFile",
                TypeName = "System.String",
                Group = "PDF Specific",
                Indexed = true,
                Analyzed = false,
                Stored = true
            });

            return fields;
        }

        /// <summary>
        /// Builds the PDF file index: opens the writer, reads the builder settings and,
        /// for the "Full" action, writes one document per PDF found under the start folder.
        /// </summary>
        /// <param name="writer">Index writer that receives the documents.</param>
        /// <param name="tracker">Tracker used for logging, status and failure reporting.</param>
        public override void Build(IIndexWriter writer, Tracker tracker)
        {
            tracker.LogInformation("{0} building using {1}", GetType().FullName, writer.GetType().FullName);
            try
            {
                tracker.LogInformation("Opening index writer.");
                writer.Open(false);
                tracker.LogInformation("Opened index writer to overwrite index");

                //load builder settings
                if (Settings != null)
                {
                    string configuredValue;
                    if (Settings.TryGetValue("StartFolder", out configuredValue))
                    {
                        StartFolder = configuredValue;
                    }
                    if (Settings.TryGetValue("Domain", out configuredValue))
                    {
                        Domain = configuredValue;
                    }
                }

                tracker.LogInformation("StartFolder: '{0}'", StartFolder);
                tracker.LogInformation("Domain: '{0}'", Domain);

                if (string.Equals(Action, "Full", StringComparison.Ordinal))
                {
                    //process files
                    tracker.LogInformation("Starting processing files.");
                    string directory = Core.SystemInformation.MapPath("/Files/") + "\\" + StartFolder.Trim(new char[] { '/', '\\' });
                    if (Directory.Exists(directory))
                    {
                        List<string> fileList = FileList(directory, tracker);
                        tracker.Status.TotalCount = fileList.Count;
                        foreach (string file in fileList)
                        {
                            // A failure on one file is logged and must not abort the whole build.
                            try
                            {
                                FileInfo fileInfo = new FileInfo(file);
                                IndexDocument document = new IndexDocument();
                                document["FileName"] = fileInfo.Name;
                                document["FileFullName"] = fileInfo.FullName;
                                document["LinkToFile"] = LinkToFile(fileInfo.FullName);
                                document["Extension"] = fileInfo.Extension;
                                document["TextContent"] = GetPdfText(fileInfo.FullName, tracker);
                                document["DirectoryFullName"] = fileInfo.DirectoryName;
                                WriteDocument(writer, tracker, document, fileInfo.FullName);
                            }
                            catch (Exception ex)
                            {
                                tracker.LogInformation("Failed getting file-info from '{0}'. Failed with exception: {1}", file, ex.Message);
                            }
                        }
                    }
                    tracker.LogInformation("--- Finished processing files ---");
                }
                else
                {
                    //check other actions and handle them
                }
            }
            catch (Exception ex)
            {
                tracker.Fail("Custom index builder experienced a fatal error: ", ex);
            }
        }

        /// <summary>
        /// Runs all extenders on the document, adds it to the index and updates the tracker status.
        /// </summary>
        private void WriteDocument(IIndexWriter writer, Tracker tracker, IndexDocument document, string filePath)
        {
            //allow extenders to process the index document
            foreach (var extender in Extenders)
            {
                extender.ExtendDocument(document);
            }

            //write to index
            writer.AddDocument(document);
            tracker.Status.Meta["CurrentFile"] = filePath;
            tracker.IncrementCounter();
        }

        /// <summary>
        /// Collects the full paths of all PDF files below <paramref name="dir"/> (recursively).
        /// Paths longer than 260 characters and paths that cannot be turned into a
        /// <see cref="FileInfo"/> are logged and skipped.
        /// </summary>
        private List<string> FileList(string dir, Tracker tracker)
        {
            // Prepare the final list of PDF files
            string[] files = Directory.GetFiles(dir, "*.pdf", SearchOption.AllDirectories);
            List<string> returnList = new List<string>(files.Length);
            foreach (string path in files)
            {
                try
                {
                    if (path.Length > 260)
                    {
                        tracker.LogInformation("Length of full path to file exceeded 260 characters. File ignored: '{0}'", path);
                        continue;
                    }

                    // Constructing FileInfo validates the path; a malformed path throws and is logged below.
                    FileInfo candidate = new FileInfo(path);

                    // A three-character search pattern extension can also match longer
                    // extensions that start with it (e.g. ".pdfx"), so check the extension explicitly.
                    if (string.Equals(candidate.Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
                    {
                        returnList.Add(path);
                    }
                }
                catch (Exception ex)
                {
                    tracker.LogInformation("Preparing file list failed with the exception: '{0}'", ex.Message);
                }
            }
            return returnList;
        }

        /// <summary>
        /// Extracts the text of every page of the given PDF.
        /// </summary>
        /// <param name="inputFile">Full path of the PDF file.</param>
        /// <param name="tracker">Tracker used to log parse failures.</param>
        /// <returns>
        /// The concatenated page text. If parsing fails, the failure is logged and whatever
        /// text was extracted before the failure (possibly an empty string) is returned.
        /// </returns>
        private string GetPdfText(string inputFile, Tracker tracker)
        {
            System.Text.StringBuilder text = new System.Text.StringBuilder();
            iTextSharp.text.pdf.PdfReader reader = null;
            try
            {
                reader = new iTextSharp.text.pdf.PdfReader(inputFile);

                // Page numbers are 1-based, so the last page is NumberOfPages itself (inclusive bound).
                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                    text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy));
                }
            }
            catch (Exception ex)
            {
                tracker.LogInformation("iTextSharper failed parsing PDF: '{0}'. Failed with exception: {1}", inputFile, ex.Message);
            }
            finally
            {
                // Always release the reader so the file handle is not kept open.
                if (reader != null)
                {
                    reader.Close();
                }
            }
            return text.ToString();
        }

        /// <summary>
        /// Builds an absolute https link to the file from the configured domain and the
        /// part of the path starting at "\Files".
        /// </summary>
        /// <param name="filePath">Full path of the file on disk.</param>
        /// <returns>
        /// The complete URL, or <paramref name="filePath"/> unchanged when no domain is
        /// configured or the path does not contain "\Files".
        /// </returns>
        private string LinkToFile(string filePath)
        {
            // If not found in Globalsettings, just return the file and its path. You can still build a complete URL on the template.
            if (string.IsNullOrEmpty(Domain) || string.IsNullOrEmpty(filePath))
            {
                return filePath;
            }

            int filesIndex = filePath.IndexOf(@"\Files", StringComparison.Ordinal);
            if (filesIndex < 0)
            {
                return filePath;
            }

            string relativeUrl = filePath.Substring(filesIndex).Replace(@"\", "/");
            return string.Format("https://{0}{1}", Domain, relativeUrl);
        }
    }
}