It occurred to me that building an index of PDF content is part of the T3 training. That material is rather new, so I thought I'd share it here in case you don't visit that part of our doc site.
I found that AbcPDF did quite a poor job, especially because it did not clean up after itself, so I ended up using the iTextSharp library:
using Dynamicweb.Diagnostics.Tracking;
using Dynamicweb.Indexing.Schemas;
using Dynamicweb.Configuration;
using Dynamicweb.Content.Files;
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;
using iTextSharp;
namespace Dynamicweb.Indexing
{
/// <summary>
/// Index builder that collects the PDF files below a start folder under /Files, extracts their
/// text with iTextSharp and writes one index document per file.
/// </summary>
public class CustomPDFIndexBuilder : IndexBuilderBase
{
    // No http context available - getting domain from custom setting. Used for building complete link to file.
    private string Domain = SystemConfiguration.Instance.GetValue("/Globalsettings/Settings/CustomPDFFileIndexer/Domain"); // your-domain.com
    private string StartFolder = FilesAndFolders.GetFilesFolderName();

    /// <summary>
    /// List of supported actions
    /// </summary>
    public override IEnumerable<string> SupportedActions
    {
        get
        {
            return new string[] { "Full", "Update" };
        }
    }

    /// <summary>
    /// Gets default settings collection ("StartFolder" and "Domain")
    /// </summary>
    public override IDictionary<string, object> DefaultSettings
    {
        get { return new Dictionary<string, object> { { "StartFolder", StartFolder }, { "Domain", Domain } }; }
    }

    /// <summary>
    /// Default constructor
    /// </summary>
    public CustomPDFIndexBuilder()
    {
        Action = "Full";
        Settings = new Dictionary<string, string>();
    }

    /// <summary>
    /// Creates new object using settings data
    /// </summary>
    /// <param name="settings">Builder settings; a null value is treated as an empty collection.</param>
    public CustomPDFIndexBuilder(IDictionary<string, string> settings)
    {
        Action = "Full";
        Settings = settings ?? new Dictionary<string, string>();
    }

    /// <summary>
    /// Gets index builder fields
    /// </summary>
    /// <returns>The fields of the file index schema extender plus the two PDF specific fields. Never null.</returns>
    public override IEnumerable<FieldDefinitionBase> GetFields()
    {
        FileIndexSchemaExtender extender = new FileIndexSchemaExtender();

        // Copy into a list of our own: the extender is not required to hand back a List<T>,
        // and a failed cast must not make this method return null.
        var extenderFields = extender.GetFields() as IEnumerable<FieldDefinitionBase>;
        List<FieldDefinitionBase> fields = extenderFields != null
            ? new List<FieldDefinitionBase>(extenderFields)
            : new List<FieldDefinitionBase>();

        // Add your custom fields
        fields.Add(new FieldDefinition() { Name = "Text Content", SystemName = "TextContent", Source = "TextContent", TypeName = "System.String", Group = "PDF Specific", Indexed = true, Analyzed = false, Stored = true });
        // SystemName used to be misspelled "LinktToFile"; it now matches the Source / document key.
        // Queries written against the misspelled name have to be updated.
        fields.Add(new FieldDefinition() { Name = "Link to file", SystemName = "LinkToFile", Source = "LinkToFile", TypeName = "System.String", Group = "PDF Specific", Indexed = true, Analyzed = false, Stored = true });

        return fields;
    }

    /// <summary>
    /// Builds the PDF index: opens the writer, applies the builder settings and, for the "Full"
    /// action, writes one document per PDF file found below the start folder.
    /// </summary>
    /// <param name="writer">Index writer the documents are added to.</param>
    /// <param name="tracker">Tracker used for logging, status and failure reporting.</param>
    public override void Build(IIndexWriter writer, Tracker tracker)
    {
        tracker.LogInformation("{0} building using {1}", GetType().FullName, writer.GetType().FullName);
        try
        {
            // NOTE(review): the writer is opened before the action is checked. Presumably Open(false)
            // means "do not append" - confirm what an unhandled action should leave in the index.
            tracker.LogInformation("Opening index writer.");
            writer.Open(false);
            tracker.LogInformation("Opened index writer to overwrite index");

            ApplySettings();
            tracker.LogInformation("StartFolder: '{0}'", StartFolder);
            tracker.LogInformation("Domain: '{0}'", Domain);

            if (!"Full".Equals(Action))
            {
                //check other actions and handle them
                tracker.LogInformation("Action '{0}' is not handled by this builder. No files were processed.", Action);
                return;
            }

            //process files
            tracker.LogInformation("Starting processing files.");
            string startFolder = (StartFolder ?? string.Empty).Trim('/', '\\');
            string directory = Path.Combine(Core.SystemInformation.MapPath("/Files/"), startFolder);

            if (Directory.Exists(directory))
            {
                List<string> fileList = FileList(directory, tracker);
                tracker.Status.TotalCount = fileList.Count;

                foreach (string file in fileList)
                {
                    // A failure on one file is logged and must not stop the remaining files.
                    try
                    {
                        FileInfo fileInfo = new FileInfo(file);
                        IndexDocument document = new IndexDocument();
                        document["FileName"] = fileInfo.Name;
                        document["FileFullName"] = fileInfo.FullName;
                        document["LinkToFile"] = LinkToFile(fileInfo.FullName);
                        document["Extension"] = fileInfo.Extension;
                        document["TextContent"] = GetPdfText(fileInfo.FullName, tracker);
                        document["DirectoryFullName"] = fileInfo.DirectoryName;
                        WriteDocument(writer, tracker, document, fileInfo.FullName);
                    }
                    catch (Exception ex)
                    {
                        tracker.LogInformation("Failed getting file-info from '{0}'. Failed with exception: {1}", file, ex.Message);
                    }
                }
            }
            else
            {
                tracker.LogInformation("Directory '{0}' does not exist. No files were processed.", directory);
            }

            tracker.LogInformation("--- Finished processing files ---");
        }
        catch (Exception ex)
        {
            tracker.Fail("Custom index builder experienced a fatal error: ", ex);
        }
    }

    /// <summary>
    /// Overrides StartFolder and Domain with the values from Settings, when present.
    /// </summary>
    private void ApplySettings()
    {
        if (Settings == null)
        {
            return;
        }

        string value;
        if (Settings.TryGetValue("StartFolder", out value))
        {
            StartFolder = value;
        }
        if (Settings.TryGetValue("Domain", out value))
        {
            Domain = value;
        }
    }

    /// <summary>
    /// Runs the registered extenders on the document, adds it to the index and updates the tracker status.
    /// </summary>
    /// <param name="writer">Index writer the document is added to.</param>
    /// <param name="tracker">Tracker whose counter and "CurrentFile" meta value are updated.</param>
    /// <param name="document">Document to write.</param>
    /// <param name="filePath">Full path of the file the document was built from.</param>
    private void WriteDocument(IIndexWriter writer, Tracker tracker, IndexDocument document, string filePath)
    {
        //allow extenders to process the index document
        if (Extenders != null)
        {
            foreach (var extender in Extenders)
            {
                extender.ExtendDocument(document);
            }
        }

        //write to index
        writer.AddDocument(document);
        tracker.Status.Meta["CurrentFile"] = filePath;
        tracker.IncrementCounter();
    }

    /// <summary>
    /// Collects the full paths of all PDF files in a directory and its sub directories.
    /// </summary>
    /// <param name="dir">Directory to search.</param>
    /// <param name="tracker">Tracker used for logging skipped files.</param>
    /// <returns>Paths of the files that will be indexed; paths longer than 260 characters are skipped.</returns>
    private List<string> FileList(string dir, Tracker tracker)
    {
        List<string> pdfFiles = new List<string>();

        foreach (string path in Directory.GetFiles(dir, "*.pdf", SearchOption.AllDirectories))
        {
            try
            {
                if (path.Length > 260)
                {
                    tracker.LogInformation("Length of full path to file exceeded 260 characters. File ignored: '{0}'", path);
                    continue;
                }

                // Constructing the FileInfo validates the path; a malformed path throws and is logged below.
                FileInfo fileInfo = new FileInfo(path);

                // On .NET Framework a three character pattern such as "*.pdf" also matches longer
                // extensions that start with ".pdf", so compare the extension exactly.
                if (string.Equals(fileInfo.Extension, ".pdf", StringComparison.OrdinalIgnoreCase))
                {
                    pdfFiles.Add(path);
                }
            }
            catch (Exception ex)
            {
                tracker.LogInformation("Preparing file list failed with the exception: '{0}'", ex.Message);
            }
        }

        return pdfFiles;
    }

    /// <summary>
    /// Extracts the text of all pages of a PDF file.
    /// </summary>
    /// <param name="InputFile">Full path of the PDF file.</param>
    /// <param name="tracker">Tracker used for logging parse failures.</param>
    /// <returns>The concatenated page texts; whatever was extracted before a failure, possibly empty.</returns>
    private string GetPdfText(string InputFile, Tracker tracker)
    {
        System.Text.StringBuilder text = new System.Text.StringBuilder();
        iTextSharp.text.pdf.PdfReader reader = null;
        try
        {
            reader = new iTextSharp.text.pdf.PdfReader(InputFile);

            // Page numbers are 1-based, so the last page is NumberOfPages itself.
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy strategy = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
                text.Append(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, page, strategy));
            }
        }
        catch (Exception ex)
        {
            tracker.LogInformation("iTextSharper failed parsing PDF: '{0}'. Failed with exception: {1}", InputFile, ex.Message);
        }
        finally
        {
            // Release the reader (and the file it holds) even when parsing failed.
            if (reader != null)
            {
                reader.Close();
            }
        }
        return text.ToString();
    }

    /// <summary>
    /// Builds an absolute https link to a file from its physical path.
    /// </summary>
    /// <param name="File">Full physical path of the file.</param>
    /// <returns>
    /// The link, or the unchanged path when no domain is configured or the path contains no "\Files" segment.
    /// You can still build a complete URL on the template in that case.
    /// </returns>
    private string LinkToFile(string File)
    {
        if (string.IsNullOrEmpty(Domain) || string.IsNullOrEmpty(File))
        {
            return File;
        }

        // The first "\Files" in the physical path is taken as the start of the web path.
        int filesIndex = File.IndexOf(@"\Files", StringComparison.OrdinalIgnoreCase);
        if (filesIndex < 0)
        {
            return File;
        }

        string webPath = File.Substring(filesIndex).Replace('\\', '/');
        return string.Format("https://{0}{1}", Domain.TrimEnd('/'), webPath);
    }
}
}