% Conference paper published in the EAI ROSENET 2023 proceedings volume
% (see the booktitle and series fields below).
% Acronyms and architecture names in title/booktitle are brace-protected so that
% sentence-casing bibliography styles keep their capitals; all personal names
% use the unambiguous "Last, First" form.
@inproceedings{1c07d43894dd41d18333b473321e532f,
  title     = {Comparing {LSTM} and {Transformer} for Video Depth Estimation},
  abstract  = {Accurate depth estimation from monocular video is critical for robotics applications such as simultaneous localization and mapping (SLAM) and navigation. Monocular depth estimation from video can be improved by incorporating temporal information across frames. The recently introduced sequence modeling techniques of recurrent long-short-term memory (LSTM) networks and Transformer architectures provide two potential approaches for aggregating temporal cues. This work presents a comparative study of using LSTM and Transformer modules for video depth prediction. The proposed depth pipeline extracts optical flow features between frames and passes them to either an LSTM or Transformer encoder before decoding into a depth map prediction. Compared to LSTM, the Transformer{\textquoteright}s ability to capture long-range dependencies allows it to propagate information more effectively across long sequences. It is shown that the Transformer outperforms LSTM models by five- to sixfold in depth map estimation based on standard metrics. This analysis provides insights into the advantages of Transformer over recurrent LSTM models for aggregation of temporal signals in depth estimation and other similar sequence prediction tasks. The Transformer{\textquoteright}s ability in aggregating motion across sequences holds promise for more robust spatial perception.},
  keywords  = {Computer vision, Depth estimation, LSTM, Machine learning, Optical flow, Sequence modeling, Transformer},
  author    = {Fani, Rozhin and Gur, Berke},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Switzerland AG 2024.; 7th EAI International Conference on Robotics and Networks, ROSENET 2023 ; Conference date: 15-12-2023 Through 16-12-2023},
  year      = {2024},
  doi       = {10.1007/978-3-031-64495-5_7},
  language  = {English},
  isbn      = {9783031644948},
  series    = {EAI/Springer Innovations in Communication and Computing},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  pages     = {89--99},
  editor    = {G{\"u}l, {\"O}mer Melih and Fiorini, Paolo and Kadry, Seifedine Nimer},
  booktitle = {7th {EAI} International Conference on Robotic Sensor Networks -- {EAI} {ROSENET} 2023},
}