
Matty Muir

Seemingly unrelated use of std::thread causing later OpenGL commands to run slowly

I'm writing a C++ application that uses OpenGL, with GLFW and GLEW for graphics. Each frame, the program runs some computationally expensive calculations and then displays the results on screen. To speed up the calculations I offload the work to several std::thread workers, then join them all, collect the data into a single vertex buffer, and draw it to the screen.
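For context, the real application's frame loop follows roughly the pattern sketched below. This is not the actual code; ComputeChunk and the buffer handling are placeholder assumptions, purely to illustrate the spawn, join, merge, upload, draw sequence:

#include <GL/glew.h>

#include <thread>
#include <vector>

// Placeholder for the per-frame expensive work; the real computation is
// application-specific and not shown here.
std::vector<float> ComputeChunk(int chunkIndex);

void RunFrame(int nthreads, unsigned int vbo)
{
    // Offload the expensive work to worker threads
    std::vector<std::vector<float>> results(nthreads);
    std::vector<std::thread> workers;
    workers.reserve(nthreads);
    for (int i = 0; i < nthreads; i++)
        workers.emplace_back([&results, i]() { results[i] = ComputeChunk(i); });
    for (std::thread& w : workers)
        w.join(); // every worker has finished before any OpenGL call is made

    // Merge the per-thread output into a single vertex buffer and draw it
    std::vector<float> vertices;
    for (const std::vector<float>& r : results)
        vertices.insert(vertices.end(), r.begin(), r.end());
    glBindBuffer(GL_ARRAY_BUFFER, vbo);
    glBufferData(GL_ARRAY_BUFFER, vertices.size() * sizeof(float),
        vertices.data(), GL_DYNAMIC_DRAW);
    glDrawArrays(GL_TRIANGLES, 0, (GLsizei)(vertices.size() / 3));
}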

However, after doing this I found that the rendering became very jittery and laggy. I've managed to reproduce this effect in a minimal example:

#include <GL/glew.h>
#include <GLFW/glfw3.h>

#include <iostream>
#include <vector>
#include <chrono>
#include <thread>

void framebuffer_size_callback(GLFWwindow* window, int width, int height);

// settings
const unsigned int SCR_WIDTH = 800;
const unsigned int SCR_HEIGHT = 600;

const char* vertexShaderSource = "#version 330 core\n"
"layout (location = 0) in vec3 aPos;\n"
"void main()\n"
"{\n"
"   gl_Position = vec4(aPos.x, aPos.y, aPos.z, 1.0);\n"
"}\0";
const char* fragmentShaderSource = "#version 330 core\n"
"out vec4 FragColor;\n"
"void main()\n"
"{\n"
"   FragColor = vec4(1.0f, 0.5f, 0.2f, 1.0f);\n"
"}\n\0";

void DoNothing()
{
    // I do nothing
}

int main()
{
    glfwInit();
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

    GLFWwindow* window = glfwCreateWindow(SCR_WIDTH, SCR_HEIGHT, "LearnOpenGL", NULL, NULL);
    if (window == NULL)
    {
        std::cout << "Failed to create GLFW window" << std::endl;
        glfwTerminate();
        return -1;
    }
    glfwMakeContextCurrent(window);
    glfwSetFramebufferSizeCallback(window, framebuffer_size_callback);

    if (glewInit() != GLEW_OK)
    {
        std::cout << "Failed to initialize GLEW" << std::endl;
        glfwTerminate();
        return -1;
    }
    
    glfwSwapInterval(0); // Disable VSync

    unsigned int vertexShader = glCreateShader(GL_VERTEX_SHADER);
    glShaderSource(vertexShader, 1, &vertexShaderSource, NULL);
    glCompileShader(vertexShader);

    int success;
    char infoLog[512];
    glGetShaderiv(vertexShader, GL_COMPILE_STATUS, &success);
    if (!success)
    {
        glGetShaderInfoLog(vertexShader, 512, NULL, infoLog);
        std::cout << "ERROR::SHADER::VERTEX::COMPILATION_FAILED\n" << infoLog << std::endl;
    }

    unsigned int fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
    glShaderSource(fragmentShader, 1, &fragmentShaderSource, NULL);
    glCompileShader(fragmentShader);

    glGetShaderiv(fragmentShader, GL_COMPILE_STATUS, &success);
    if (!success)
    {
        glGetShaderInfoLog(fragmentShader, 512, NULL, infoLog);
        std::cout << "ERROR::SHADER::FRAGMENT::COMPILATION_FAILED\n" << infoLog << std::endl;
    }

    unsigned int shaderProgram = glCreateProgram();
    glAttachShader(shaderProgram, vertexShader);
    glAttachShader(shaderProgram, fragmentShader);
    glLinkProgram(shaderProgram);

    glGetProgramiv(shaderProgram, GL_LINK_STATUS, &success);
    if (!success) {
        glGetProgramInfoLog(shaderProgram, 512, NULL, infoLog);
        std::cout << "ERROR::SHADER::PROGRAM::LINKING_FAILED\n" << infoLog << std::endl;
    }
    glDeleteShader(vertexShader);
    glDeleteShader(fragmentShader);

    glUseProgram(shaderProgram);

    float vertices[] = {
        -0.5f, -0.5f, 0.0f, // left  
         0.5f, -0.5f, 0.0f, // right 
         0.0f,  0.5f, 0.0f  // top   
    };

    unsigned int VBO, VAO;
    glGenVertexArrays(1, &VAO);
    glGenBuffers(1, &VBO);

    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, VBO);
    glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);

    glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), (void*)0);
    glEnableVertexAttribArray(0);

    glBindBuffer(GL_ARRAY_BUFFER, 0);

    while (!glfwWindowShouldClose(window))
    {
        glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
        glClear(GL_COLOR_BUFFER_BIT);

        // ===== Initialize Some Threads to do Nothing =====
        int nthreads = 1;
        std::vector<std::thread> threads;
        threads.reserve(nthreads);
        for (int threadIndex = 0; threadIndex < nthreads; threadIndex++)
            threads.emplace_back(DoNothing);
        for (std::thread& t : threads)
            t.join();
        // =================================================

        glFinish(); // Ensure OpenGL is totally finished before starting the timer

        auto startTime = std::chrono::high_resolution_clock::now();
        glDrawArrays(GL_TRIANGLES, 0, 3);
        glfwSwapBuffers(window);
        glfwPollEvents();
        glFinish(); // Make sure all commands are finished before stopping timer
        auto endTime = std::chrono::high_resolution_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime).count();
        std::cout << "Took: " << duration * 0.001 << "ms\n";
    }

    glDeleteVertexArrays(1, &VAO);
    glDeleteBuffers(1, &VBO);
    glDeleteProgram(shaderProgram);

    glfwTerminate();
    return 0;
}

void framebuffer_size_callback(GLFWwindow* window, int width, int height)
{
    glViewport(0, 0, width, height);
}

This code is copied almost directly from the minimal sample on learnopengl.com. The only significant change is to the main render loop:

while (!glfwWindowShouldClose(window))
{
    glClearColor(0.2f, 0.3f, 0.3f, 1.0f);
    glClear(GL_COLOR_BUFFER_BIT);

    // ===== Initialize Some Threads to do Nothing =====
    int nthreads = 1;
    std::vector<std::thread> threads;
    threads.reserve(nthreads);
    for (int threadIndex = 0; threadIndex < nthreads; threadIndex++)
        threads.emplace_back(DoNothing);
    for (std::thread& t : threads)
        t.join();
    // =================================================

    glFinish(); // Ensure OpenGL is totally finished before starting the timer

    auto startTime = std::chrono::high_resolution_clock::now();
    glDrawArrays(GL_TRIANGLES, 0, 3);
    glfwSwapBuffers(window);
    glfwPollEvents();
    glFinish(); // Make sure all commands are finished before stopping timer
    auto endTime = std::chrono::high_resolution_clock::now();

    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime).count();
    std::cout << "Took: " << duration * 0.001 << "ms\n";
}

Here I spawn a single thread (since the variable 'nthreads' is set to 1) that runs DoNothing(), then join it and continue. Despite the thread doing no work at all, the timings suffered a drastic change.

The timings with 'nthreads' set to 0 (threading disabled) are as follows:

Took: 5.14ms
Took: 1.011ms
Took: 0.615ms
Took: 0.883ms
Took: 0.894ms
Took: 0.847ms
Took: 0.831ms
Took: 0.477ms
Took: 0.456ms
Took: 0.874ms
Took: 0.451ms
Took: 0.46ms
Took: 0.488ms
Took: 1.323ms
Took: 0.932ms
Took: 0.887ms
Took: 0.611ms
Took: 0.623ms
Took: 0.576ms
Took: 2.17ms
Took: 0.453ms
Took: 0.348ms
Took: 0.41ms
Took: 0.566ms
Took: 0.427ms
Took: 0.366ms
Took: 0.406ms
Took: 1.143ms
Took: 0.584ms
Took: 1.192ms
Took: 1.586ms
Took: 0.945ms
Took: 1.482ms
Took: 0.69ms
Took: 0.588ms
Took: 0.856ms
Took: 0.592ms
Took: 0.533ms
Took: 0.938ms
Took: 0.71ms
Took: 0.592ms
Took: 0.54ms
Took: 0.553ms
Took: 0.521ms
Took: 0.549ms

The timings with 'nthreads' set to 1, by contrast, were:

Took: 63.177ms
Took: 0.983ms
Took: 0.841ms
Took: 1.004ms
Took: 0.336ms
Took: 92.499ms
Took: 98.086ms
Took: 93.81ms
Took: 93.909ms
Took: 106.853ms
Took: 96.406ms
Took: 102.337ms
Took: 109.134ms
Took: 104.278ms
Took: 101.201ms
Took: 102.328ms
Took: 94.609ms
Took: 92.916ms
Took: 62.069ms
Took: 98.112ms
Took: 108.483ms
Took: 101.531ms
Took: 100.358ms
Took: 100.211ms
Took: 93.724ms
Took: 99.032ms
Took: 92.898ms
Took: 97.362ms
Took: 95.587ms
Took: 93.366ms
Took: 60.48ms
Took: 0.438ms
Took: 0.314ms
Took: 0.296ms
Took: 0.291ms
Took: 0.289ms
Took: 0.36ms
Took: 0.327ms
Took: 0.315ms
Took: 0.293ms
Took: 0.286ms
Took: 0.546ms
Took: 0.291ms
Took: 0.289ms
Took: 0.303ms
Took: 0.295ms
Took: 0.298ms
Took: 0.301ms
Took: 102.664ms
Took: 97.894ms
Took: 99.621ms
Took: 60.743ms
Took: 0.484ms
Took: 0.348ms
Took: 0.302ms
Took: 0.295ms
Took: 0.304ms
Took: 0.558ms
Took: 0.297ms
Took: 0.306ms
Took: 0.293ms
Took: 0.291ms
Took: 0.339ms
Took: 0.301ms
Took: 0.304ms
Took: 0.379ms
Took: 0.399ms
Took: 0.3ms
Took: 0.299ms
Took: 0.295ms
Took: 0.291ms
Took: 0.322ms
Took: 0.775ms
Took: 0.307ms
Took: 0.295ms
Took: 0.294ms
Took: 0.286ms
Took: 0.308ms
Took: 0.342ms
Took: 0.454ms
Took: 0.314ms

For some reason, spawning a single thread that runs no code makes the times sporadic: sometimes they are perfectly fine, and sometimes they are ridiculously high. In my original application these spikes are even larger, and they are causing a significant problem.
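To separate the cost of the thread itself from the OpenGL timings, the spawn and join can be measured in isolation. A minimal sketch along those lines (not part of the repro above, and with no OpenGL involved):

#include <chrono>
#include <iostream>
#include <thread>

// Times only the std::thread spawn + join, with no OpenGL calls at all.
// On a typical desktop this stays well under a millisecond per iteration,
// nowhere near the ~100ms spikes seen above.
int main()
{
    for (int i = 0; i < 10; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();
        std::thread t([]() { /* do nothing */ });
        t.join();
        auto end = std::chrono::high_resolution_clock::now();
        std::cout << "spawn+join: "
            << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
            << "us\n";
    }
    return 0;
}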

I appreciate any help people can offer me.

The code samples were compiled with MSVC 2022 and run on an NVIDIA GTX 1060 GPU with an AMD Ryzen 5 1600X CPU.

c++

opengl

optimization

concurrency
