Extract text and tables from PDF in C# and VB.NET
When reading the text content of a PDF file, GemBox.Document will recognize the file's logical structure and represent it using Table and Paragraph elements. You can read more about how GemBox.Document detects a PDF's structure on the "Support level for reading PDF format" help page.
If you don't need the logical structure, but instead want to know the exact position of the text (e.g., on which page and at which coordinates a piece of text is located), take a look at this alternative approach for reading PDF text using GemBox.Pdf.
The following example shows how you can read paragraphs and tables from a PDF file using GemBox.Document.
using GemBox.Document;
using GemBox.Document.Tables;
using System;
using System.Linq;
/// <summary>
/// Example program that reads a PDF file with GemBox.Document and prints
/// its paragraphs and tables to the console.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the input document, prints how many paragraphs and tables were found,
    /// then prints the first paragraph's text and the cells of the last table.
    /// </summary>
    static void Main()
    {
        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        // "%InputFileName%" is a placeholder for the path of the PDF file to read.
        var document = DocumentModel.Load("%InputFileName%");

        // Get paragraphs.
        // The query is materialized once so that counting the elements and then
        // picking one of them does not enumerate the document a second time.
        var paragraphs = document.GetChildElements(true, ElementType.Paragraph).Cast<Paragraph>().ToList();

        // Get tables (materialized once, for the same reason as above).
        var tables = document.GetChildElements(true, ElementType.Table).Cast<Table>().ToList();

        // Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}");
        Console.WriteLine($"Table count: {tables.Count}");
        Console.WriteLine();

        // Display first paragraph's content (skipped when no paragraph was found).
        var paragraph = paragraphs.FirstOrDefault();
        if (paragraph != null)
        {
            Console.WriteLine("Paragraph content:");
            Console.WriteLine(paragraph.Content.ToString());
        }

        // Display last table's content (skipped when no table was found).
        var table = tables.LastOrDefault();
        if (table != null)
        {
            Console.WriteLine("Table content:");
            foreach (var row in table.Rows)
            {
                // Each cell is printed as a fixed-width column: trailing whitespace is
                // trimmed, the text is padded to at least 15 characters, then '|' is appended.
                foreach (var cell in row.Cells)
                {
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(15)}|");
                }

                // End the current row's line.
                Console.WriteLine();
            }
        }
    }
}
Imports GemBox.Document
Imports GemBox.Document.Tables
Imports System
Imports System.Linq
''' <summary>
''' Example program that reads a PDF file with GemBox.Document and prints
''' its paragraphs and tables to the console.
''' </summary>
Module Program

    ''' <summary>
    ''' Loads the input document, prints how many paragraphs and tables were found,
    ''' then prints the first paragraph's text and the cells of the last table.
    ''' </summary>
    Sub Main()

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        ' "%InputFileName%" is a placeholder for the path of the PDF file to read.
        Dim document = DocumentModel.Load("%InputFileName%")

        ' Get paragraphs.
        ' The query is materialized once so that counting the elements and then
        ' picking one of them does not enumerate the document a second time.
        Dim paragraphs = document.GetChildElements(True, ElementType.Paragraph).Cast(Of Paragraph)().ToList()

        ' Get tables (materialized once, for the same reason as above).
        Dim tables = document.GetChildElements(True, ElementType.Table).Cast(Of Table)().ToList()

        ' Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}")
        Console.WriteLine($"Table count: {tables.Count}")
        Console.WriteLine()

        ' Display first paragraph's content (skipped when no paragraph was found).
        Dim paragraph = paragraphs.FirstOrDefault()
        If paragraph IsNot Nothing Then
            Console.WriteLine("Paragraph content:")
            Console.WriteLine(paragraph.Content.ToString())
        End If

        ' Display last table's content (skipped when no table was found).
        ' The guard must test the table itself: testing the paragraph here would
        ' dereference a null table in a document that has paragraphs but no tables.
        Dim table = tables.LastOrDefault()
        If table IsNot Nothing Then
            Console.WriteLine("Table content:")
            For Each row In table.Rows
                ' Each cell is printed as a fixed-width column: trailing whitespace is
                ' trimmed, the text is padded to at least 15 characters, then "|" is appended.
                For Each cell In row.Cells
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(15)}|")
                Next

                ' End the current row's line.
                Console.WriteLine()
            Next
        End If

    End Sub

End Module