Running a C# .NET console app using HtmlAgilityPack that scrapes a webpage and loops appears to produce a memory leak. Where is the issue here?


When I run this C# console app code (using the HtmlAgilityPack open source project), memory usage starts to build up over time in what appears to be a memory leak, but I can’t track it down, despite doing forced Garbage Collection, reusing objects, setting things to null, and clearing out information.

Where is the flaw in my code here? Or is this an issue with HtmlAgilityPack?

Things just grind to a halt after 10 minutes of this loop running. And this is on a system with 128 GB of RAM.

using HtmlAgilityPack;


internal class Program
{
    private static int setNum = 16020;
    static HtmlDocument doc = null;
    static HtmlWeb web = new HtmlWeb();

    static void Main(string[] args)
    {
        ScrapeTypeYearName(setNum);

        Console.WriteLine("Press Enter to exit...");
        Console.ReadLine(); // Wait for the user to press Enter
    }

    public static void ScrapeTypeYearName(int setNum)
    {
        string setUrl = "https://www.tcdb.com/Checklist.cfm/sid/" + setNum;

        doc = web.Load(setUrl);

        // Process breadcrumb information
        string category = "";
        string year = "";
        string company = "";
        string setName = "";
        var breadcrumb = doc.DocumentNode.SelectSingleNode("//div[@class="d-none d-md-block"]/nav[@aria-label="breadcrumb"]");
        if (breadcrumb != null)
        {
            var breadcrumbItems = breadcrumb.SelectNodes("./ol/li/a");
            if (breadcrumbItems != null && breadcrumbItems.Count >= 4)
            {
                category = breadcrumbItems[1].InnerText.Trim();
                year = breadcrumbItems[3].InnerText.Trim();
                Console.WriteLine("Category: " + category);
                Console.WriteLine("Year: " + year);

                var companyNode = breadcrumb.SelectSingleNode("following::h1[@class="site"]");
                var setNode = breadcrumb.SelectSingleNode("following::h3[@class="site"]");
                if (companyNode != null)
                {
                    company = companyNode.InnerText.Trim();
                    company = company.Replace(year, "").Trim();
                    Console.WriteLine("Company: " + company);
                }

                if (setNode != null)
                {
                    setName = setNode.InnerText.Trim();
                    Console.WriteLine("Set: " + setName);
                }
            }
        } 

        // Do stuff here with the 

        // Clear the main document to free up memory
        doc?.DocumentNode.RemoveAll();

        setNum += 1;
        if (setNum < 99999)
        {
            Console.WriteLine($"");
            Console.WriteLine($"On set #: {setNum}");
            // Force garbage collection before the next recursive call
            GC.Collect();
            GC.WaitForPendingFinalizers();
            ScrapeTypeYearName(setNum);
        }
    }
}

Leave a Reply

Your email address will not be published. Required fields are marked *