Extract text and tables from PDF

The following example shows how you can read paragraphs and tables from a PDF file using the GemBox.Document component.

C#
VB.NET

using GemBox.Document;
using GemBox.Document.Tables;
using System;
using System.Linq;

/// <summary>
/// Console sample that loads a PDF file with GemBox.Document and prints the number of
/// paragraphs and tables it contains, the first paragraph's text, and the last table's cells.
/// </summary>
class Program
{
    /// <summary>
    /// Loads the input document, collects its paragraphs and tables, and writes a short
    /// report to the console.
    /// </summary>
    static void Main()
    {
        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        var document = DocumentModel.Load("%InputFileName%");

        // Get paragraphs. The result is materialized once because it is both counted and
        // queried below; this avoids enumerating the GetChildElements sequence repeatedly.
        var paragraphs = document.GetChildElements(true, ElementType.Paragraph).Cast<Paragraph>().ToList();

        // Get tables (materialized for the same reason).
        var tables = document.GetChildElements(true, ElementType.Table).Cast<Table>().ToList();

        // Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}");
        Console.WriteLine($"Table count: {tables.Count}");
        Console.WriteLine();

        // Display first paragraph's content (skipped when the document has no paragraphs).
        var paragraph = paragraphs.FirstOrDefault();
        if (paragraph != null)
        {
            Console.WriteLine("Paragraph content:");
            Console.WriteLine(paragraph.Content.ToString());
        }

        // Display last table's content (skipped when the document has no tables).
        var table = tables.LastOrDefault();
        if (table != null)
        {
            Console.WriteLine("Table content:");
            foreach (var row in table.Rows)
            {
                foreach (var cell in row.Cells)
                {
                    // Trim trailing whitespace, then pad each cell to 15 characters
                    // so the columns line up, separated by '|'.
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(15)}|");
                }

                Console.WriteLine();
            }
        }
    }
}

Imports GemBox.Document
Imports GemBox.Document.Tables
Imports System
Imports System.Linq

''' <summary>
''' Console sample that loads a PDF file with GemBox.Document and prints the number of
''' paragraphs and tables it contains, the first paragraph's text, and the last table's cells.
''' </summary>
Module Program

    ''' <summary>
    ''' Loads the input document, collects its paragraphs and tables, and writes a short
    ''' report to the console.
    ''' </summary>
    Sub Main()

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim document = DocumentModel.Load("%InputFileName%")

        ' Get paragraphs. The result is materialized once because it is both counted and
        ' queried below; this avoids enumerating the GetChildElements sequence repeatedly.
        Dim paragraphs = document.GetChildElements(True, ElementType.Paragraph).Cast(Of Paragraph)().ToList()

        ' Get tables (materialized for the same reason).
        Dim tables = document.GetChildElements(True, ElementType.Table).Cast(Of Table)().ToList()

        ' Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}")
        Console.WriteLine($"Table count: {tables.Count}")
        Console.WriteLine()

        ' Display first paragraph's content (skipped when the document has no paragraphs).
        Dim paragraph = paragraphs.FirstOrDefault()
        If paragraph IsNot Nothing Then
            Console.WriteLine("Paragraph content:")
            Console.WriteLine(paragraph.Content.ToString())
        End If

        ' Display last table's content (skipped when the document has no tables).
        ' The guard must test the table itself: checking the paragraph here would
        ' dereference a null table for documents that contain no tables.
        Dim table = tables.LastOrDefault()
        If table IsNot Nothing Then
            Console.WriteLine("Table content:")
            For Each row In table.Rows
                For Each cell In row.Cells
                    ' Trim trailing whitespace, then pad each cell to 15 characters
                    ' so the columns line up, separated by '|'.
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(15)}|")
                Next
                Console.WriteLine()
            Next
        End If

    End Sub
End Module

Reading a PDF file and extracting its paragraphs and tables in C# and VB.NET — Screenshot of the text and table read from the input PDF file

If you don't need the logical structure, but instead want to know the exact position of the text (e.g., on which page and coordinates some text is located), take a look at this alternative approach for reading PDF text using GemBox.Pdf.

You can read more about how GemBox.Document detects a PDF's structure on the "Support level for reading PDF format" help page.

Next steps

GemBox.Document is a .NET component that enables you to read, write, edit, convert, and print document files from your .NET applications using one simple API. How about testing it today?

Download Buy

Extract text and tables from PDF

See also

Next steps