c++ - Instancing with OpenGL 3.3 seems very slow -


I wrote a minimal code sample in C++ that renders 10000 colored quads on screen. I am using "instancing" and update the model matrix of each quad every frame. The data of the 6 vertices is stored in an individual VBO and reused every time. The projection matrix (orthographic) is injected once at program start via a uniform. The model matrix is calculated on the CPU with the GLM library. I measured the rendering time and got an average of only 52 FPS. I think this is far too low, but I cannot find the mistake/bottleneck in my little sample program.

After some analysis it seems that the 3 calculations done with GLM are very slow. Am I doing something wrong here? For example, if I remove the rotation calculation, I get a boost of 10 FPS! Maybe you can help me find out what I can do better here and how I can optimize my sample. It is important to me that each quad is individually configurable during runtime, which is why I decided to use instancing. Moving the matrix calculations to the GPU seems to be another option, but I am just confused why the CPU has so many problems calculating the 10000 model matrices! OK, my CPU is very weak (Athlon 2 Core-Duo M300, the GPU is an ATI Mobility Radeon 4100), but it should do this task in no measurable time, shouldn't it?

Here is a minimal, working, compilable example (if you have GLFW and GLM). Maybe someone has some time and can help me out here :)

#define glew_static #define glm_force_inline #define glm_force_sse2 #include "glew.h" #include "glfw3.h" #include "glm.hpp" #include "glm/gtc/matrix_transform.hpp" #include <conio.h> #include <cstdlib> #include <iostream> #include <ctime>  gluint buildshader() {     std::string strvscode =      "#version 330 core\n"     "in vec3 vertexposition;\n"     "in mat4 modelmatrix;\n"     "uniform mat4 projectionmatrix;\n"     "out vec4 m_color;\n"     "void main() {\n"     "   vec4 vecvertex = vec4(vertexposition, 1);\n"     "   gl_position = projectionmatrix * modelmatrix * vecvertex;\n"     "   m_color = gl_position;\n"     "}\n";      std::string strfscode = "#version 330 core\n"     "out vec4 frag_colour;\n"     "in vec4 m_color;\n"     "void main() {\n"     "   frag_colour = vec4(m_color.x, m_color.y, m_color.z, 0.5f);\n"     "}\n";      gluint gluivertexshaderid = glcreateshader(gl_vertex_shader);     char const * vertexsourcepointer = strvscode.c_str();     glshadersource(gluivertexshaderid, 1, &vertexsourcepointer, null);     glcompileshader(gluivertexshaderid);     gluint gluifragmentshaderid = glcreateshader(gl_fragment_shader);     char const * fragmentsourcepointer = strfscode.c_str();     glshadersource(gluifragmentshaderid, 1, &fragmentsourcepointer, null);     glcompileshader(gluifragmentshaderid);     gluint gluiprogramid = glcreateprogram();     glattachshader(gluiprogramid, gluivertexshaderid);     glattachshader(gluiprogramid, gluifragmentshaderid);     gllinkprogram(gluiprogramid);     gldeleteshader(gluivertexshaderid);     gldeleteshader(gluifragmentshaderid);     return gluiprogramid; }  struct sprite {     glm::vec3 position, dimension;     float speed, rotation, rx, ry; };  struct vertex {     float x, y, z;     vertex(){};     vertex(float x, float y, float z) : x(x), y(y), z(z) {} };  int main(int arc, char **argv) {     // glfw init     int displayreswith   = 1366; //modify here     int displayresheight = 768;  //modify here     glfwinit();     
glfwwindowhint(glfw_context_version_major, 3);     glfwwindowhint(glfw_context_version_minor, 3);     glfwwindowhint(glfw_opengl_forward_compat, 1);     glfwwindowhint(glfw_opengl_profile, glfw_opengl_core_profile);     glfwwindowhint(glfw_red_bits, 8);     glfwwindowhint(glfw_green_bits, 8);     glfwwindowhint(glfw_blue_bits, 8);     glfwwindowhint(glfw_alpha_bits, 8);     glfwwindowhint(glfw_depth_bits, 32);     glfwwindowhint(glfw_stencil_bits, 32);     glfwwindow* window = glfwcreatewindow(displayreswith, displayresheight,"instancing", glfwgetprimarymonitor(),null);     int width, height;     glfwmakecontextcurrent(window);     glfwswapinterval(0);     glfwgetframebuffersize(window, &width, &height);      //glew init     glewexperimental = gl_true;     glewinit();     const glubyte* renderer = glgetstring(gl_renderer);     const glubyte* version = glgetstring(gl_version);     std::cout << "renderer: " << renderer << std::endl;     std::cout << "opengl supported version: " << version << std::endl;      //opengl init     glenable(gl_cull_face);      glcullface(gl_back);     glenable(gl_depth_test);     gldepthfunc(gl_less);     glenable(gl_blend);     glblendfunc(gl_src_alpha, gl_one_minus_src_alpha);     glclearcolor(255.0f, 255.0f, 255.0f, 255.0f);      //shader     gluint programid = buildshader();      //vbo vertexbuffer     gluint vertexbuffer;     glgenbuffers(1, &vertexbuffer);     glbindbuffer(gl_array_buffer, vertexbuffer);     vertex vertexbufferdata[6];     vertexbufferdata[0] = vertex(-0.5f, 0.5f, 0.0f);    //links oben     vertexbufferdata[1] = vertex(-0.5f, -0.5f, 0.0f);   //links unten     vertexbufferdata[2] = vertex(0.5f, -0.5f, 0.0f);    //rechts unten     vertexbufferdata[3] = vertexbufferdata[2];          //rechts unten     vertexbufferdata[4] = vertex(0.5f, 0.5f, 0.0f);     //rechts oben     vertexbufferdata[5] = vertexbufferdata[0];          //links oben     glbufferdata(gl_array_buffer, sizeof(vertex)*6, vertexbufferdata, gl_static_draw);   
   //vbo instancebuffer     gluint instancebuffer;     glgenbuffers(1, &instancebuffer);     glbindbuffer(gl_array_buffer, instancebuffer);     int imaxinstancecount = 30000;     glm::mat4 *ptrinstancebufferdata = new glm::mat4[imaxinstancecount];     glbufferdata(gl_array_buffer, imaxinstancecount * sizeof(glm::mat4), null, gl_stream_draw);      //vao - start     gluint vertexarrayobject;     glgenvertexarrays(1, &vertexarrayobject);     glbindvertexarray(vertexarrayobject);          //for vbo vertexbuffer         glenablevertexattribarray(glgetattriblocation(programid, "vertexposition"));         glbindbuffer(gl_array_buffer, vertexbuffer);         glvertexattribpointer(             glgetattriblocation(programid, "vertexposition"),             3,                                                               gl_float,                                                        gl_false,                                                        sizeof(vertex),                                                  (void*)0                                                         );          glvertexattribdivisor(0, 0);          //for vbo instancebuffer         int pos = glgetattriblocation(programid, "modelmatrix");         int pos1 = pos + 0;         int pos2 = pos + 1;         int pos3 = pos + 2;         int pos4 = pos + 3;         glenablevertexattribarray(pos1);         glenablevertexattribarray(pos2);         glenablevertexattribarray(pos3);         glenablevertexattribarray(pos4);         glbindbuffer(gl_array_buffer, instancebuffer);         glvertexattribpointer(pos1, 4, gl_float, gl_false, sizeof(glfloat) * 4 * 4, (void*)(0));         glvertexattribpointer(pos2, 4, gl_float, gl_false, sizeof(glfloat) * 4 * 4, (void*)(sizeof(float) * 4));         glvertexattribpointer(pos3, 4, gl_float, gl_false, sizeof(glfloat) * 4 * 4, (void*)(sizeof(float) * 8));         glvertexattribpointer(pos4, 4, gl_float, gl_false, sizeof(glfloat) * 4 * 4, (void*)(sizeof(float) * 12));         
glvertexattribdivisor(pos1, 1);         glvertexattribdivisor(pos2, 1);         glvertexattribdivisor(pos3, 1);         glvertexattribdivisor(pos4, 1);      glbindvertexarray(0); //vao - end      //matrix vars     glm::mat4 projection, rotating, scaling, translation, identity;     glm::vec3 zrotatevec(0.0f, 0.0f, 1.0f);      //calc projection-matrix , put shader (uniform)     projection = glm::ortho(0.0f, (float)width, 0.0f, (float)height, 0.0f, 1.0f);     gluseprogram(programid);     gluniformmatrix4fv(glgetuniformlocation(programid, "projectionmatrix"), 1, gl_false, &projection[0][0]);      //creating sprites     std::srand(static_cast<unsigned int>(std::time(0)));     int iactinstancecount = 10000;     sprite *ptrsprites = new sprite[iactinstancecount];     (int = 0; < iactinstancecount; ++i)     {         ptrsprites[i].dimension = glm::vec3(16, 16, 1.0f);         ptrsprites[i].position = glm::vec3(std::rand()%(width-32),std::rand()%(height-32),-1.0f *((std::rand()%256)/256.0f));         ptrsprites[i].rotation = rand() % 360 + 0.0f;         ptrsprites[i].rx = static_cast<float>(std::rand() % 2);         ptrsprites[i].ry = static_cast<float>(std::rand() % 2);         ptrsprites[i].speed = (std::rand() % 100) + 1.0f;         if (ptrsprites[i].speed < 1.0f) ptrsprites[i].speed = 1.0f;     }      //fps init     double fframesrendered = 0.0f;     double fframemeasurementstart = 0.0f;     double ffps = 0.0f;     double fcurrenttime = 0.0f;     glfwsettime(0);      //main-loop (also renderloop)     while (!glfwwindowshouldclose(window))     {         //application-logic         if (glfwgetkey(window, glfw_key_escape)== glfw_press)             glfwsetwindowshouldclose(window, gl_true);          const double fnewtime = glfwgettime();         double fdeltatime = fnewtime - fcurrenttime;         fcurrenttime = fnewtime;          (int = 0; < iactinstancecount; ++i)         {             float fspeed = ptrsprites[i].speed * static_cast<float>(fdeltatime);             
ptrsprites[i].rotation += fspeed;             if (ptrsprites[i].rotation >= 360.0f) ptrsprites[i].rotation = 0.0f;             if (ptrsprites[i].rx == 1)  ptrsprites[i].position.x = ptrsprites[i].position.x + fspeed;             if (ptrsprites[i].rx == 0)  ptrsprites[i].position.x = ptrsprites[i].position.x - fspeed;             if (ptrsprites[i].ry == 1)  ptrsprites[i].position.y = ptrsprites[i].position.y + fspeed;             if (ptrsprites[i].ry == 0)  ptrsprites[i].position.y = ptrsprites[i].position.y - fspeed;             if (ptrsprites[i].position.x <= 0) ptrsprites[i].rx = 1;             if (ptrsprites[i].position.x + ptrsprites[i].dimension.x >= width) ptrsprites[i].rx = 0;             if (ptrsprites[i].position.y <= 0) ptrsprites[i].ry = 1;             if (ptrsprites[i].position.y + ptrsprites[i].dimension.y >= height) ptrsprites[i].ry = 0;              //matrix-calculations (saved in local buffer)             translation = glm::translate(identity, ptrsprites[i].position + glm::vec3(ptrsprites[i].dimension.x / 2.0f, ptrsprites[i].dimension.y / 2.0f, 0.0f));             scaling = glm::scale(translation, ptrsprites[i].dimension);             ptrinstancebufferdata[i] = glm::rotate(scaling, ptrsprites[i].rotation, zrotatevec);         }          //render-call         glclear(gl_color_buffer_bit | gl_depth_buffer_bit);         gluseprogram(programid);         glbindvertexarray(vertexarrayobject);         glbindbuffer(gl_array_buffer, instancebuffer);         glbufferdata(gl_array_buffer, imaxinstancecount * sizeof(glm::mat4), null, gl_stream_draw); // buffer orphaning         glbuffersubdata(gl_array_buffer, 0, iactinstancecount * sizeof(glm::mat4), ptrinstancebufferdata);         gldrawarraysinstanced(gl_triangles, 0, 6, iactinstancecount);         glbindvertexarray(0);         glfwswapbuffers(window);         glfwpollevents();           //fps-stuff         ++fframesrendered;          if ((fcurrenttime*1000.0f) >= (fframemeasurementstart*1000.0f) + 1000.0f)  
       {             ffps = ((fcurrenttime*1000.0f) - (fframemeasurementstart*1000.0f)) / 1000.0f * fframesrendered;             fframemeasurementstart = fcurrenttime;             fframesrendered = 0;             std::cout << "fps: " << ffps << std::endl;         }     }      //termination , cleanup     gldeletebuffers(1, &vertexbuffer);     gldeletebuffers(1, &instancebuffer);     gldeletevertexarrays(1, &vertexarrayobject);     gldeleteprogram(programid);     glfwdestroywindow(window);     glfwterminate();     return _getch(); } 

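Well, after testing it on my machine, it is definitely CPU limited, so nothing you do with OGL is going to make much difference. I get about ~300 FPS with GCC on at least -O1, but only ~80 with -O0. My CPU is very fast (i7 2600K, 4.7 GHz), but my GPU is rather slow (GT 520). I'm also on Ubuntu.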

Some quick ideas for things that might speed it up a little:

  • Put the vertex positions in an array in the vertex shader and use gl_VertexID to access them
  • Use GL_TRIANGLE_STRIP instead of GL_TRIANGLES
  • Use radians for angles, as otherwise GLM has to convert them

None of these are likely to make much of an impact, really. Just make sure your compiler is set up right, and there probably isn't much more to do.


Comments