We recently experienced an issue with Arnold renders consistently failing on the cloud with the following traceback:
-- TRACEBACK BEGIN --
Traceback from karma 20.0.688 (Compiled on linux-x86_64-gcc9.3):
stackTrace(UTsignalHandlerArg) <libHoudiniUT.so>
signalCallback(UTsignalHandlerArg) <libHoudiniUT.so>
UT_Signal::UT_ComboSignalHandler::operator()(int, siginfo_t*, void*) const <libHoudiniUT.so>
UT_Signal::processSignal(int, siginfo_t*, void*) <libHoudiniUT.so>
__memcpy_ssse3_back <libc.so.6>
__memcpy_ssse3_back <libc.so.6>
std::vector<int, std::allocator<int> > pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_Reader<pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::_PreadStream>::Read<int>(std::vector<int, std::allocator<int> >*) [clone .isra.0] <libpxr_usd.so>
std::vector<int, std::allocator<int> > pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_Reader<pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::_PreadStream>::Read<int>(std::vector<int, std::allocator<int> >*) [clone .isra.0] <libpxr_usd.so>
std::vector<pxrInternal_v0_23__pxrReserved__::SdfPayload, std::allocator<pxrInternal_v0_23__pxrReserved__::SdfPayload> > pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_Reader<pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::_MmapStream<pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_FileMapping*> >::Read<pxrInternal_v0_23__pxrReserved__::SdfPayload>(std::vector<pxrInternal_v0_23__pxrReserved__::SdfPayload, std::allocator<pxrInternal_v0_23__pxrReserved__::SdfPayload> >*) [clone .isra.0] <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_InitMMap() <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::CrateFile(std::string const&, std::string const&, pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::_FileMapping&&, std::shared_ptr<pxrInternal_v0_23__pxrReserved__::ArAsset> const&) <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::Usd_CrateFile::CrateFile::Open(std::string const&, std::shared_ptr<pxrInternal_v0_23__pxrReserved__::ArAsset> const&, bool) <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::Usd_CrateData::Open(std::string const&, std::shared_ptr<pxrInternal_v0_23__pxrReserved__::ArAsset> const&, bool) <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::UsdUsdcFileFormat::_ReadFromAsset(pxrInternal_v0_23__pxrReserved__::SdfLayer*, std::string const&, std::shared_ptr<pxrInternal_v0_23__pxrReserved__::ArAsset> const&, bool, bool) const <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::_Tf_RegistryFunction165(pxrInternal_v0_23__pxrReserved__::TfType*, void*) <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::UsdUsdFileFormat::Read(pxrInternal_v0_23__pxrReserved__::SdfLayer*, std::string const&, bool) const <libpxr_usd.so>
pxrInternal_v0_23__pxrReserved__::SdfLayer::_Read(std::string const&, pxrInternal_v0_23__pxrReserved__::ArResolvedPath const&, bool) <libpxr_sdf.so>
pxrInternal_v0_23__pxrReserved__::SdfLayer::SetDetachedLayerRules(pxrInternal_v0_23__pxrReserved__::SdfLayer::DetachedLayerRules const&) <libpxr_sdf.so>
tbb::interface7::internal::delegated_function<pxrInternal_v0_23__pxrReserved__::SdfLayer::FindOrOpen(std::string const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&)::{lambda()#1} const, pxrInternal_v0_23__pxrReserved__::TfRefPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> >::operator()() const <libpxr_sdf.so>
tbb::interface7::internal::isolate_within_arena(tbb::interface7::internal::delegate_base&, long) <libtbb.so.2>
pxrInternal_v0_23__pxrReserved__::TfRefPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> tbb::interface7::internal::isolate_impl<pxrInternal_v0_23__pxrReserved__::TfRefPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer>, pxrInternal_v0_23__pxrReserved__::SdfLayer::FindOrOpen(std::string const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&)::{lambda()#1} const>(pxrInternal_v0_23__pxrReserved__::SdfLayer::FindOrOpen(std::string const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&)::{lambda()#1} const&) <libpxr_sdf.so>
pxrInternal_v0_23__pxrReserved__::SdfLayer::FindOrOpen(std::string const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&) <libpxr_sdf.so>
pxrInternal_v0_23__pxrReserved__::PcpLayerStack::_BuildLayerStack(pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> const&, pxrInternal_v0_23__pxrReserved__::SdfLayerOffset const&, double, pxrInternal_v0_23__pxrReserved__::ArResolverContext const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&, std::string const&, pxrInternal_v0_23__pxrReserved__::Pcp_MutedLayers const&, std::set<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer>, std::less<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> >, std::allocator<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> > >*, std::vector<std::shared_ptr<pxrInternal_v0_23__pxrReserved__::PcpErrorBase>, std::allocator<std::shared_ptr<pxrInternal_v0_23__pxrReserved__::PcpErrorBase> > >*)::{lambda(unsigned long)#1}::operator()(unsigned long) const <libpxr_pcp.so>
pxrInternal_v0_23__pxrReserved__::WorkDispatcher::_InvokerTask<pxrInternal_v0_23__pxrReserved__::PcpLayerStack::_BuildLayerStack(pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> const&, pxrInternal_v0_23__pxrReserved__::SdfLayerOffset const&, double, pxrInternal_v0_23__pxrReserved__::ArResolverContext const&, std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&, std::string const&, pxrInternal_v0_23__pxrReserved__::Pcp_MutedLayers const&, std::set<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer>, std::less<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> >, std::allocator<pxrInternal_v0_23__pxrReserved__::TfWeakPtr<pxrInternal_v0_23__pxrReserved__::SdfLayer> > >*, std::vector<std::shared_ptr<pxrInternal_v0_23__pxrReserved__::PcpErrorBase>, std::allocator<std::shared_ptr<pxrInternal_v0_23__pxrReserved__::PcpErrorBase> > >*)::{lambda(pxrInternal_v0_23__pxrReserved__::WorkDispatcher&)#2}::operator()(pxrInternal_v0_23__pxrReserved__::WorkDispatcher&) const::{lambda()#1}>::execute() <libpxr_pcp.so>
tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::process_bypass_loop(tbb::internal::context_guard_helper<false>&, tbb::task*, long) (custom_scheduler.h:474)
tbb::internal::custom_scheduler<tbb::internal::IntelSchedulerTraits>::local_wait_for_all(tbb::task&, tbb::task*) (custom_scheduler.h:636)
tbb::internal::arena::process(tbb::internal::generic_scheduler&) (arena.cpp:196)
tbb::internal::market::process(rml::job&) (market.cpp:667)
tbb::internal::rml::private_worker::run() (private_server.cpp:266)
tbb::internal::rml::private_worker::thread_routine(void*) (private_server.cpp:219)
start_thread <libpthread.so.0>
__clone <libc.so.6>
-- TRACEBACK END --
After much debugging, we began to suspect it was related to I/O timing: IOPS and/or throughput exhaustion of the shared storage caching service (FSx) was causing some read operations not to return fast enough. We spent some money with AWS to push the capacity up for both, and the issue went away.
Nevertheless, I still wanted to bring it up:
- in case someone else runs into it.
- in case there’s anything that could be improved on the USD side, in the Usd_CrateFile::CrateFile code path, to better handle throughput delays.
- in case there’s anything we can do on our side, maybe in the way the USD data is organized, to help prevent this.