From 70c6d40435f9ebde52b31873375c491a0fff3efa Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Sun, 8 Mar 2020 01:26:15 +0100
Subject: [PATCH 01/68] memory benchmark rss

---
 examples/benchmarks.py    | 76 ++++++++++++++++++++++++++++++++-------
 examples/requirements.txt |  1 +
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 07de19d4b518..9342745fcee4 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -18,9 +18,11 @@
 # If checking the tensors placement
 # tf.debugging.set_log_device_placement(True)
 
+import os
 import argparse
 import csv
 import timeit
+import psutil
 from time import time
 from typing import List
 
@@ -248,8 +250,31 @@
 as they entered."""
 
 
+def get_memory_diff(func, memory_field="rss", repeat=1):
+    process = psutil.Process(os.getpid())
+    mem_diffs = []
+    for _ in range(repeat):
+        mi_before = process.memory_info()
+        output = func()
+        mi_after = process.memory_info()
+        mem_diff = getattr(mi_after, memory_field) - getattr(mi_before, memory_field)
+        mem_diffs.append(mem_diff)
+    output = sum(mem_diffs) / len(mem_diffs)
+    return output
+
+
+def memory_to_human_readable(memory_amount):
+    for unit in ['B','KB','MB','GB']:
+        if memory_amount > -1024.0 and memory_amount < 1024.0:
+            return f"{memory_amount:.3f}{unit}"
+        memory_amount /= 1024.0
+    return f"{memory_amount:.3f}TB"
+
+
 def create_setup_and_compute(
     model_names: List[str],
+    batch_sizes: List[int],
+    slice_sizes: List[int],
     gpu: bool = True,
     tensorflow: bool = False,
     average_over: int = 3,
@@ -259,6 +284,7 @@ def create_setup_and_compute(
     fp16: bool = False,
     save_to_csv: bool = False,
     csv_filename: str = f"results_{round(time())}.csv",
+    csv_memory_filename: str = f"memory_{round(time())}.csv",
 ):
     if xla:
         tf.config.optimizer.set_jit(True)
@@ -267,11 +293,11 @@ def create_setup_and_compute(
 
     if tensorflow:
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
+        results = _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp)
     else:
         device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
+        results = _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16)
 
     print("=========== RESULTS ===========")
     for model_name in model_names:
@@ -280,13 +306,14 @@ def create_setup_and_compute(
             print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
             for slice_size in results[model_name]["ss"]:
                 result = results[model_name]["results"][batch_size][slice_size]
+                memory = results[model_name]["memory"][batch_size][slice_size]
                 if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory_to_human_readable(memory)}")
                 else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s "  f"{memory_to_human_readable(memory)}")
 
     if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file:
+        with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
             fieldnames = [
                 "model",
                 "1x8",
@@ -317,6 +344,8 @@ def create_setup_and_compute(
 
             writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
             writer.writeheader()
+            memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames)
+            memory_writer.writeheader()
 
             for model_name in model_names:
                 model_results = {
@@ -326,8 +355,15 @@ def create_setup_and_compute(
                 }
                 writer.writerow({"model": model_name, **model_results})
 
+                model_memory_results = {
+                    f"{bs}x{ss}": results[model_name]["memory"][bs][ss]
+                    for bs in results[model_name]["memory"]
+                    for ss in results[model_name]["memory"][bs]
+                }
+                memory_writer.writerow({"model": model_name, **model_memory_results})
+
 
-def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
+def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
@@ -337,11 +373,10 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript,
         tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
 
         max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
 
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
         dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
+        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
 
         for batch_size in batch_sizes:
             if fp16:
@@ -366,14 +401,19 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript,
                         runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                         average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                         dictionary[model_name]["results"][batch_size][slice_size] = average_time
+
+                        mem_diff = get_memory_diff(lambda: inference(sequence), repeat=5)
+                        dictionary[model_name]["memory"][batch_size][slice_size] = mem_diff
+
                     except RuntimeError as e:
                         print("Doesn't fit on GPU.", e)
                         torch.cuda.empty_cache()
                         dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
     return dictionary
 
 
-def _compute_tensorflow(model_names, dictionary, average_over, amp):
+def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name)
@@ -383,11 +423,10 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp):
         tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
 
         max_input_size = tokenizer.max_model_input_sizes[model_name]
-        batch_sizes = [1, 2, 4, 8]
-        slice_sizes = [8, 64, 128, 256, 512, 1024]
 
-        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}}
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}}
         dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
+        dictionary[model_name]["memory"] = {i: {} for i in batch_sizes}
 
         print("Using model", model)
 
@@ -412,10 +451,15 @@ def inference(inputs):
                         runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                         average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                         dictionary[model_name]["results"][batch_size][slice_size] = average_time
+
+                        mem_diff = get_memory_diff(lambda: inference(sequence), repeat=3)
+                        dictionary[model_name]["memory"][batch_size][slice_size] = mem_diff
+
                     except tf.errors.ResourceExhaustedError as e:
                         print("Doesn't fit on GPU.", e)
                         torch.cuda.empty_cache()
                         dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                        dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
     return dictionary
 
 
@@ -477,6 +521,8 @@ def main():
     parser.add_argument(
         "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
     )
+    parser.add_argument('--batch_sizes', nargs='+', type=int, default=[1, 2, 4, 8])
+    parser.add_argument('--slice_sizes', nargs='+', type=int, default=[8, 64, 128, 256, 512, 1024])
 
     args = parser.parse_args()
     if args.models == "all":
@@ -501,6 +547,8 @@ def main():
         if is_torch_available():
             create_setup_and_compute(
                 model_names=args.models,
+                batch_sizes=args.batch_sizes,
+                slice_sizes=args.slice_sizes,
                 tensorflow=False,
                 gpu=args.torch_cuda,
                 torchscript=args.torchscript,
@@ -516,6 +564,8 @@ def main():
         if is_tf_available():
             create_setup_and_compute(
                 model_names=args.models,
+                batch_sizes=args.batch_sizes,
+                slice_sizes=args.slice_sizes,
                 tensorflow=True,
                 xla=args.xla,
                 amp=args.amp,
diff --git a/examples/requirements.txt b/examples/requirements.txt
index 36229755e818..6a4126c92638 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -2,3 +2,4 @@ tensorboardX
 tensorboard
 scikit-learn
 seqeval
+psutil

From e0c50a67491b618bcf97fd2f1c9efef460f28225 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Tue, 10 Mar 2020 16:29:10 +0100
Subject: [PATCH 02/68] have both forward pass and line-by-line mem tracing

---
 examples/benchmarks.py             | 38 +++++++++++++++---
 src/transformers/__init__.py       |  2 +-
 src/transformers/modeling_utils.py | 64 ++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+), 6 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 9342745fcee4..81479f8c4fbb 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -26,7 +26,7 @@
 from time import time
 from typing import List
 
-from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available
+from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available, set_memory_tracing
 
 
 if is_tf_available():
@@ -251,6 +251,8 @@
 
 
 def get_memory_diff(func, memory_field="rss", repeat=1):
+    """ We include these inside the pytorch model butcan't do it for TF 2.0 so let's keep the method here for now.
+    """
     process = psutil.Process(os.getpid())
     mem_diffs = []
     for _ in range(repeat):
@@ -382,7 +384,11 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
             if fp16:
                 model.half()
             model.to(device)
-            model.eval()
+            model.train()
+
+            memory_list = set_memory_tracing('transformers')  # Line by line tracing for all code in the module `transformers`
+            model.add_memory_hooks()  # Forward methode tracing for a PyTorch model
+
             for slice_size in slice_sizes:
                 if max_input_size is not None and slice_size > max_input_size:
                     dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
@@ -397,13 +403,35 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
                             inference = model
                             inference(sequence)
 
+                        mem_diffs = []
+                        mem_diffs_set = {}
+                        for line1, line2 in zip(memory_list[:-1], memory_list[1:]):
+                            filename, name, line_no, event, line_text, mem1 = line1
+                            _, _, _, _, _, mem2 = line2
+                            mem_diff = mem2 - mem1
+                            mem_diffs.append((filename, line_no, line_text, mem_diff))
+                            if (filename, line_no, line_text) in mem_diffs_set:
+                                mem_diffs_set[(filename, line_no, line_text)] = mem_diffs_set[(filename, line_no, line_text)] + mem_diff
+                            else:
+                                mem_diffs_set[(filename, line_no, line_text)] = mem_diff
+                            mem_str = memory_to_human_readable(mem_diff)
+                            print(f"{filename}:{line_no}: mem {mem_str}: {line_text}")
+
+                        mem_diffs = sorted(list(mem_diffs_set.items()), key=lambda x: x[1], reverse=False)
+                        for (filename, line_no, line_text), mem_diff in mem_diffs[-10:]:
+                            mem_str = memory_to_human_readable(mem_diff)
+                            print(f"---- {filename}:{line_no}: mem {mem_str}: {line_text}")
+
+                        print('sum of all lines increase', memory_to_human_readable(sum(m[1] for m in mem_diffs)))
+                        print('forward pass increase', memory_to_human_readable(model.mem_rss_diff))
+
+                        exit()
+
                         print("Going through model with sequence of shape", sequence.shape)
                         runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
                         average_time = sum(runtimes) / float(len(runtimes)) / 3.0
                         dictionary[model_name]["results"][batch_size][slice_size] = average_time
-
-                        mem_diff = get_memory_diff(lambda: inference(sequence), repeat=5)
-                        dictionary[model_name]["memory"][batch_size][slice_size] = mem_diff
+                        dictionary[model_name]["memory"][batch_size][slice_size] = model.mem_rss_diff
 
                     except RuntimeError as e:
                         print("Doesn't fit on GPU.", e)
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 16b83154251b..0840d19382c6 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -136,7 +136,7 @@
 
 # Modeling
 if is_torch_available():
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, set_memory_tracing
     from .modeling_auto import (
         AutoModel,
         AutoModelForPreTraining,
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5a733fe758c3..5047fa5461a4 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -18,6 +18,9 @@
 import logging
 import os
 import typing
+import sys
+import psutil
+import linecache
 
 import torch
 from torch import nn
@@ -39,6 +42,36 @@
 
 logger = logging.getLogger(__name__)
 
+
+def set_memory_tracing(module_to_trace):
+    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+        See `../../examples/benchmarks.py for a usage example.
+    """
+    memory_list = []
+    process = psutil.Process(os.getpid())
+
+    def traceit(frame, event, arg):
+        """ Tracing method executed before running each line in a module or sub-module
+            Record memory allocated in a list with debugging information
+        """
+        name = frame.f_globals["__name__"]
+        if event != "line" or (not isinstance(name, str)) or (module_to_trace not in name):
+            return traceit
+
+        lineno = frame.f_lineno
+        filename = frame.f_globals["__file__"]
+        if (filename.endswith(".pyc") or
+            filename.endswith(".pyo")):
+            filename = filename[:-1]
+        line = linecache.getline(filename, lineno).rstrip()
+        mem = process.memory_info()
+        memory_list.append((filename, name, lineno, event, line, mem.rss))
+        return traceit
+
+    sys.settrace(traceit)
+    return memory_list
+
+
 try:
     from torch.nn import Identity
 except ImportError:
@@ -66,6 +99,37 @@ def num_parameters(self, only_trainable: bool = False) -> int:
         params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters()
         return sum(p.numel() for p in params)
 
+    @staticmethod
+    def _hook_rss_memory_pre_forward(module, *args, **kwargs):
+        process = psutil.Process(os.getpid())
+        mem = process.memory_info()
+        module.mem_rss_pre_forward = mem.rss
+        return None
+
+
+    @staticmethod
+    def _hook_rss_memory_post_forward(module, *args, **kwargs):
+        process = psutil.Process(os.getpid())
+        mem = process.memory_info()
+        module.mem_rss_post_forward = mem.rss
+        mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward
+        module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, 'mem_rss_diff') else 0)
+        return None
+
+    def add_memory_hooks(self):
+        """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
+            Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()`
+        """
+        for module in self.modules():
+            module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
+            module.register_forward_hook(self._hook_rss_memory_post_forward)
+
+    def reset_memory_hooks_state(self):
+        for module in self.modules:
+            module.mem_rss_diff = 0
+            module.mem_rss_post_forward = 0
+            module.mem_rss_pre_forward = 0
+
 
 class PreTrainedModel(nn.Module, ModuleUtilsMixin):
     r""" Base class for all models.

From 03e14b2188837f6dbb019ceec2b920d274e3b6e7 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 10:15:35 +0100
Subject: [PATCH 03/68] cleaned up tracing

---
 examples/benchmarks.py             | 155 +++++++++++++++++++----------
 src/transformers/__init__.py       |   5 +-
 src/transformers/file_utils.py     | 112 +++++++++++++++++++++
 src/transformers/modeling_gpt2.py  |   2 +-
 src/transformers/modeling_utils.py |  44 ++------
 5 files changed, 230 insertions(+), 88 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 81479f8c4fbb..de96047e4d3c 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -25,8 +25,9 @@
 import psutil
 from time import time
 from typing import List
+from collections import defaultdict
 
-from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available, set_memory_tracing
+from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available, start_memory_tracing, stop_memory_tracing, bytes_to_human_readable
 
 
 if is_tf_available():
@@ -265,14 +266,6 @@ def get_memory_diff(func, memory_field="rss", repeat=1):
     return output
 
 
-def memory_to_human_readable(memory_amount):
-    for unit in ['B','KB','MB','GB']:
-        if memory_amount > -1024.0 and memory_amount < 1024.0:
-            return f"{memory_amount:.3f}{unit}"
-        memory_amount /= 1024.0
-    return f"{memory_amount:.3f}TB"
-
-
 def create_setup_and_compute(
     model_names: List[str],
     batch_sizes: List[int],
@@ -280,6 +273,9 @@ def create_setup_and_compute(
     gpu: bool = True,
     tensorflow: bool = False,
     average_over: int = 3,
+    no_speed: bool = False,
+    no_memory: bool = False,
+    verbose: bool = False,
     torchscript: bool = False,
     xla: bool = False,
     amp: bool = False,
@@ -295,11 +291,11 @@ def create_setup_and_compute(
 
     if tensorflow:
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp)
+        results = _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose)
     else:
         device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16)
+        results = _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose)
 
     print("=========== RESULTS ===========")
     for model_name in model_names:
@@ -310,9 +306,9 @@ def create_setup_and_compute(
                 result = results[model_name]["results"][batch_size][slice_size]
                 memory = results[model_name]["memory"][batch_size][slice_size]
                 if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory_to_human_readable(memory)}")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{bytes_to_human_readable(memory)}")
                 else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s "  f"{memory_to_human_readable(memory)}")
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s "  f"{bytes_to_human_readable(memory)}")
 
     if save_to_csv:
         with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
@@ -365,7 +361,7 @@ def create_setup_and_compute(
                 memory_writer.writerow({"model": model_name, **model_memory_results})
 
 
-def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16):
+def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
@@ -384,10 +380,7 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
             if fp16:
                 model.half()
             model.to(device)
-            model.train()
-
-            memory_list = set_memory_tracing('transformers')  # Line by line tracing for all code in the module `transformers`
-            model.add_memory_hooks()  # Forward methode tracing for a PyTorch model
+            model.eval()
 
             for slice_size in slice_sizes:
                 if max_input_size is not None and slice_size > max_input_size:
@@ -403,35 +396,47 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
                             inference = model
                             inference(sequence)
 
-                        mem_diffs = []
-                        mem_diffs_set = {}
-                        for line1, line2 in zip(memory_list[:-1], memory_list[1:]):
-                            filename, name, line_no, event, line_text, mem1 = line1
-                            _, _, _, _, _, mem2 = line2
-                            mem_diff = mem2 - mem1
-                            mem_diffs.append((filename, line_no, line_text, mem_diff))
-                            if (filename, line_no, line_text) in mem_diffs_set:
-                                mem_diffs_set[(filename, line_no, line_text)] = mem_diffs_set[(filename, line_no, line_text)] + mem_diff
-                            else:
-                                mem_diffs_set[(filename, line_no, line_text)] = mem_diff
-                            mem_str = memory_to_human_readable(mem_diff)
-                            print(f"{filename}:{line_no}: mem {mem_str}: {line_text}")
-
-                        mem_diffs = sorted(list(mem_diffs_set.items()), key=lambda x: x[1], reverse=False)
-                        for (filename, line_no, line_text), mem_diff in mem_diffs[-10:]:
-                            mem_str = memory_to_human_readable(mem_diff)
-                            print(f"---- {filename}:{line_no}: mem {mem_str}: {line_text}")
-
-                        print('sum of all lines increase', memory_to_human_readable(sum(m[1] for m in mem_diffs)))
-                        print('forward pass increase', memory_to_human_readable(model.mem_rss_diff))
-
-                        exit()
+                        if not no_memory:
+                            model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
+
+                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
+                            _, increased_memory = start_memory_tracing('transformers')
+                            output = inference(sequence)  # you should keep the output otherwise garbage collector will free all :-)
+                            stop_memory_tracing()
+                            del output
+                            total_memory_consumption = sum(m[1] for m in increased_memory)
+
+                            if verbose:
+                                # Print line by line memory consumption
+                                for frame, mem_increase in increased_memory:
+                                    mem_str = bytes_to_human_readable(mem_increase)
+                                    print(f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
+
+                                # Compute cumulative memory consumption for each line (when we go several times over the same line)
+                                cumulative_increased_memory = defaultdict(lambda: 0)
+                                for frame, mem_increase in increased_memory:
+                                    cumulative_increased_memory[frame] += mem_increase
+
+                                NUM_TOP_LINES = 5 # Print the top N lines consuming the most memory
+                                print(f'\nTop {NUM_TOP_LINES} script lines consuming the most memory:')
+                                top_mem_lines = sorted(list(cumulative_increased_memory.items()), key=lambda x: x[1], reverse=True)[:NUM_TOP_LINES+1]
+                                for i, (frame, mem) in enumerate(top_mem_lines):
+                                    mem_str = bytes_to_human_readable(mem)
+                                    print(f"{i} => {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
+
+                                print('\nMemory increase computed by summing traced script lines:', bytes_to_human_readable(total_memory_consumption))
+                                print('Memory increase computed by PyTorch forward pass hook:  ', bytes_to_human_readable(model.mem_rss_diff))
+                            dictionary[model_name]["memory"][batch_size][slice_size] = total_memory_consumption
+                        else:
+                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
-                        print("Going through model with sequence of shape", sequence.shape)
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
-                        dictionary[model_name]["memory"][batch_size][slice_size] = model.mem_rss_diff
+                        if not no_speed:
+                            print("Going through model with sequence of shape", sequence.shape)
+                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
+                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                        else:
+                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
 
                     except RuntimeError as e:
                         print("Doesn't fit on GPU.", e)
@@ -441,7 +446,7 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
     return dictionary
 
 
-def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp):
+def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name)
@@ -476,12 +481,43 @@ def inference(inputs):
                         # To make sure that the model is traced + that the tensors are on the appropriate device
                         inference(sequence)
 
-                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                        average_time = sum(runtimes) / float(len(runtimes)) / 3.0
-                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                        if not no_memory:
+                            # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
+                            _, increased_memory = start_memory_tracing('transformers')
+                            output = inference(sequence)  # you should keep the output otherwise garbage collector will free all :-)
+                            stop_memory_tracing()
+                            del output
+                            total_memory_consumption = sum(m[1] for m in increased_memory)
+
+                            if verbose:
+                                # Print line by line memory consumption
+                                for frame, mem_increase in increased_memory:
+                                    mem_str = bytes_to_human_readable(mem_increase)
+                                    print(f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
+
+                                # Compute cumulative memory consumption for each line (when we go several times over the same line)
+                                cumulative_increased_memory = defaultdict(lambda: 0)
+                                for frame, mem_increase in increased_memory:
+                                    cumulative_increased_memory[frame] += mem_increase
+
+                                NUM_TOP_LINES = 5 # Print the top N lines consuming the most memory
+                                print(f'\nTop {NUM_TOP_LINES} script lines consuming the most memory:')
+                                top_mem_lines = sorted(list(cumulative_increased_memory.items()), key=lambda x: x[1], reverse=True)[:NUM_TOP_LINES+1]
+                                for i, (frame, mem) in enumerate(top_mem_lines):
+                                    mem_str = bytes_to_human_readable(mem)
+                                    print(f"{i} => {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
+
+                                print('\nMemory increase computed by summing traced script lines:', bytes_to_human_readable(total_memory_consumption))
+                            dictionary[model_name]["memory"][batch_size][slice_size] = total_memory_consumption
+                        else:
+                            dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
-                        mem_diff = get_memory_diff(lambda: inference(sequence), repeat=3)
-                        dictionary[model_name]["memory"][batch_size][slice_size] = mem_diff
+                        if not no_speed:
+                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
+                            dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                        else:
+                            dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
 
                     except tf.errors.ResourceExhaustedError as e:
                         print("Doesn't fit on GPU.", e)
@@ -505,6 +541,15 @@ def main():
         "of all available model "
         "architectures.",
     )
+    parser.add_argument(
+        "--verbose", required=False, action="store_true", help="Verbose memory tracing"
+    )
+    parser.add_argument(
+        "--no_speed", required=False, action="store_true", help="Don't perform speed measurments"
+    )
+    parser.add_argument(
+        "--no_memory", required=False, action="store_true", help="Don't perform memory measurments"
+    )
     parser.add_argument(
         "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
     )
@@ -584,6 +629,9 @@ def main():
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
                 average_over=args.average_over,
+                no_speed=args.no_speed,
+                no_memory=args.no_memory,
+                verbose=args.verbose,
             )
         else:
             raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
@@ -600,6 +648,9 @@ def main():
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
                 average_over=args.average_over,
+                no_speed=args.no_speed,
+                no_memory=args.no_memory,
+                verbose=args.verbose,
             )
         else:
             raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 0840d19382c6..4d3bd1c0e5ca 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -74,6 +74,9 @@
     cached_path,
     is_tf_available,
     is_torch_available,
+    start_memory_tracing,
+    stop_memory_tracing,
+    bytes_to_human_readable,
 )
 
 # Model Cards
@@ -136,7 +139,7 @@
 
 # Modeling
 if is_torch_available():
-    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering, set_memory_tracing
+    from .modeling_utils import PreTrainedModel, prune_layer, Conv1D, top_k_top_p_filtering
     from .modeling_auto import (
         AutoModel,
         AutoModelForPreTraining,
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index dfc6d1a8feff..13f19efb5b13 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -18,6 +18,8 @@
 from typing import Optional
 from urllib.parse import urlparse
 from zipfile import ZipFile, is_zipfile
+import linecache
+from collections import namedtuple
 
 import boto3
 import requests
@@ -496,3 +498,113 @@ def _resumable_file_manager():
             json.dump(meta, meta_file)
 
     return cache_path
+
+
+_memory_tracing_enabled = False
+
+def start_memory_tracing(module_to_trace=None, events_to_trace='line'):
+    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+        See `../../examples/benchmarks.py for a usage example.
+
+        Args:
+            - `module_to_trace`: if not None, string indicating the module or sub-module to trace,
+                only events from this module will be recorded (e.g. 'transformers' or 'transformers.modeling_gpt2')
+                If None, all events are recorded
+            - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
+                default to line
+
+        Return: Tuple with two lists which will be updated during tracing:
+            - `used_memory_list` is a list of `UsedMemoryState` for each event (default each line of the traced script).
+                - `UsedMemoryState` are named tuples with the following fields:
+                    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+                    - 'rss_memory': RSS memory state *before* executing the line
+
+                - `IncreasedMemoryState` are named tuples similar to `UsedMemoryState` but recording the *increase* in memory after executing each line.
+                    They have the following fields:
+                    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+                    - 'rss_memory_increase': RSS memory *increase* *after* executing the line
+
+                `Frame` namedtuple are used by `UsedMemoryState` and `IncreasedMemoryState` to list the current frame state.
+                    The have the following fields:
+                    - 'filename' (string): Name of the file currently executed
+                    - 'module' (string): Name of the module currently executed
+                    - 'line_number' (int): Number of the line currently executed
+                    - 'event' (string): Event that triggered the tracing (default will be "line")
+                    - 'line_text' (string): Text of the line in the python script
+
+    """
+    try:
+        import psutil
+    except (ImportError):
+        raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
+
+    Frame = namedtuple('Frame', ['filename', 'module', 'line_number', 'event', 'line_text'])
+    UsedMemoryState = namedtuple('UsedMemoryState', ['frame', 'rss_memory'])
+    IncreasedMemoryState = namedtuple('IncreasedMemoryState', ['frame', 'rss_memory_increase'])
+    used_memory_list, increased_memory_list = [], []
+    process = psutil.Process(os.getpid())
+
+    def traceit(frame, event, args):
+        """ Tracing method executed before running each line in a module or sub-module
+            Record memory allocated in a list with debugging information
+        """
+        global _memory_tracing_enabled
+
+        if not _memory_tracing_enabled:
+            return traceit
+
+        name = frame.f_globals["__name__"]
+        if events_to_trace is not None:
+            if isinstance(events_to_trace, str) and event != events_to_trace:
+                return traceit
+            elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
+                return traceit
+            elif not isinstance(name, str):
+                return traceit
+            elif module_to_trace is not None and module_to_trace not in name:
+                return traceit
+
+        # Record current tracing state (file, location in file...)
+        lineno = frame.f_lineno
+        filename = frame.f_globals["__file__"]
+        if (filename.endswith(".pyc") or
+            filename.endswith(".pyo")):
+            filename = filename[:-1]
+        line = linecache.getline(filename, lineno).rstrip()
+
+        # Record current memory state (rss memory) and compute difference with previous memory state
+        mem = process.memory_info()
+        traced_state = Frame(filename, name, lineno, event, line)
+        mem_state = UsedMemoryState(traced_state, mem.rss)
+        used_memory_list.append(mem_state)
+
+        if len(used_memory_list) > 1:
+            prev_frame, prev_mem = used_memory_list[-2]
+            increased_mem_state = IncreasedMemoryState(prev_frame, mem.rss - prev_mem)
+            increased_memory_list.append(increased_mem_state)
+
+        return traceit
+
+    sys.settrace(traceit)
+
+    global _memory_tracing_enabled
+    _memory_tracing_enabled = True
+
+    return used_memory_list, increased_memory_list
+
+
+def stop_memory_tracing():
+    """ Stop memory tracing cleanly
+    """
+    global _memory_tracing_enabled
+    _memory_tracing_enabled = False
+
+
+def bytes_to_human_readable(memory_amount):
+    """ Utility to convert a number of bytes (int) in a human readable string (with units)
+    """
+    for unit in ['B','KB','MB','GB']:
+        if memory_amount > -1024.0 and memory_amount < 1024.0:
+            return f"{memory_amount:.3f}{unit}"
+        memory_amount /= 1024.0
+    return f"{memory_amount:.3f}TB"
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index b492d7fc374b..657b4764ed13 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -24,7 +24,7 @@
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
-from .activations import gelu_new
+from .activations import gelu_new, gelu
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5047fa5461a4..d4fa38d64a75 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -18,9 +18,6 @@
 import logging
 import os
 import typing
-import sys
-import psutil
-import linecache
 
 import torch
 from torch import nn
@@ -38,40 +35,9 @@
     hf_bucket_url,
     is_remote_url,
 )
-
-
 logger = logging.getLogger(__name__)
 
 
-def set_memory_tracing(module_to_trace):
-    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
-        See `../../examples/benchmarks.py for a usage example.
-    """
-    memory_list = []
-    process = psutil.Process(os.getpid())
-
-    def traceit(frame, event, arg):
-        """ Tracing method executed before running each line in a module or sub-module
-            Record memory allocated in a list with debugging information
-        """
-        name = frame.f_globals["__name__"]
-        if event != "line" or (not isinstance(name, str)) or (module_to_trace not in name):
-            return traceit
-
-        lineno = frame.f_lineno
-        filename = frame.f_globals["__file__"]
-        if (filename.endswith(".pyc") or
-            filename.endswith(".pyo")):
-            filename = filename[:-1]
-        line = linecache.getline(filename, lineno).rstrip()
-        mem = process.memory_info()
-        memory_list.append((filename, name, lineno, event, line, mem.rss))
-        return traceit
-
-    sys.settrace(traceit)
-    return memory_list
-
-
 try:
     from torch.nn import Identity
 except ImportError:
@@ -101,6 +67,11 @@ def num_parameters(self, only_trainable: bool = False) -> int:
 
     @staticmethod
     def _hook_rss_memory_pre_forward(module, *args, **kwargs):
+        try:
+            import psutil
+        except (ImportError):
+            raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
+
         process = psutil.Process(os.getpid())
         mem = process.memory_info()
         module.mem_rss_pre_forward = mem.rss
@@ -109,6 +80,11 @@ def _hook_rss_memory_pre_forward(module, *args, **kwargs):
 
     @staticmethod
     def _hook_rss_memory_post_forward(module, *args, **kwargs):
+        try:
+            import psutil
+        except (ImportError):
+            raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
+
         process = psutil.Process(os.getpid())
         mem = process.memory_info()
         module.mem_rss_post_forward = mem.rss

From f77d7d909b50875f5203e112ec26e56d7637e2b6 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 14:19:20 +0100
Subject: [PATCH 04/68] refactored and cleaning up API

---
 examples/benchmarks.py                 | 172 +++++++++++++++----------
 src/transformers/__init__.py           |   2 +-
 src/transformers/configuration_gpt2.py |   4 +
 src/transformers/file_utils.py         | 123 +++++++++++++-----
 src/transformers/modeling_gpt2.py      |   4 +-
 src/transformers/modeling_utils.py     |   8 +-
 6 files changed, 203 insertions(+), 110 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index de96047e4d3c..0b571a8cec17 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -18,16 +18,24 @@
 # If checking the tensors placement
 # tf.debugging.set_log_device_placement(True)
 
-import os
 import argparse
 import csv
+import os
 import timeit
-import psutil
 from time import time
 from typing import List
-from collections import defaultdict
 
-from transformers import AutoConfig, AutoTokenizer, is_tf_available, is_torch_available, start_memory_tracing, stop_memory_tracing, bytes_to_human_readable
+import psutil
+
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    bytes_to_human_readable,
+    is_tf_available,
+    is_torch_available,
+    start_memory_tracing,
+    stop_memory_tracing,
+)
 
 
 if is_tf_available():
@@ -291,11 +299,25 @@ def create_setup_and_compute(
 
     if tensorflow:
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose)
+        results = _compute_tensorflow(
+            model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
+        )
     else:
         device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu"
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose)
+        results = _compute_pytorch(
+            model_names,
+            batch_sizes,
+            slice_sizes,
+            dictionary,
+            average_over,
+            device,
+            torchscript,
+            fp16,
+            no_speed,
+            no_memory,
+            verbose,
+        )
 
     print("=========== RESULTS ===========")
     for model_name in model_names:
@@ -306,9 +328,18 @@ def create_setup_and_compute(
                 result = results[model_name]["results"][batch_size][slice_size]
                 memory = results[model_name]["memory"][batch_size][slice_size]
                 if isinstance(result, str):
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{bytes_to_human_readable(memory)}")
+                    print(
+                        f"\t\t{model_name}/{batch_size}/{slice_size}: "
+                        f"{result} "
+                        f"{bytes_to_human_readable(memory)}"
+                    )
                 else:
-                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s "  f"{bytes_to_human_readable(memory)}")
+                    print(
+                        f"\t\t{model_name}/{batch_size}/{slice_size}: "
+                        f"{(round(1000 * result) / 1000)}"
+                        f"s "
+                        f"{bytes_to_human_readable(memory)}"
+                    )
 
     if save_to_csv:
         with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
@@ -361,7 +392,19 @@ def create_setup_and_compute(
                 memory_writer.writerow({"model": model_name, **model_memory_results})
 
 
-def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_over, device, torchscript, fp16, no_speed, no_memory, verbose):
+def _compute_pytorch(
+    model_names,
+    batch_sizes,
+    slice_sizes,
+    dictionary,
+    average_over,
+    device,
+    torchscript,
+    fp16,
+    no_speed,
+    no_memory,
+    verbose,
+):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
@@ -397,36 +440,31 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
                             inference(sequence)
 
                         if not no_memory:
-                            model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
+                            # model.add_memory_hooks()  # Forward method tracing (only for PyTorch models)
 
                             # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            _, increased_memory = start_memory_tracing('transformers')
-                            output = inference(sequence)  # you should keep the output otherwise garbage collector will free all :-)
-                            stop_memory_tracing()
-                            del output
-                            total_memory_consumption = sum(m[1] for m in increased_memory)
+                            trace = start_memory_tracing("transformers")
+                            output = inference(sequence)  # noqa: F841
+                            summary = stop_memory_tracing(trace)
+                            del output  # you should keep the output and delete it after otherwise garbage collector will free all :-)
 
                             if verbose:
-                                # Print line by line memory consumption
-                                for frame, mem_increase in increased_memory:
-                                    mem_str = bytes_to_human_readable(mem_increase)
-                                    print(f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
-
-                                # Compute cumulative memory consumption for each line (when we go several times over the same line)
-                                cumulative_increased_memory = defaultdict(lambda: 0)
-                                for frame, mem_increase in increased_memory:
-                                    cumulative_increased_memory[frame] += mem_increase
-
-                                NUM_TOP_LINES = 5 # Print the top N lines consuming the most memory
-                                print(f'\nTop {NUM_TOP_LINES} script lines consuming the most memory:')
-                                top_mem_lines = sorted(list(cumulative_increased_memory.items()), key=lambda x: x[1], reverse=True)[:NUM_TOP_LINES+1]
-                                for i, (frame, mem) in enumerate(top_mem_lines):
-                                    mem_str = bytes_to_human_readable(mem)
-                                    print(f"{i} => {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
-
-                                print('\nMemory increase computed by summing traced script lines:', bytes_to_human_readable(total_memory_consumption))
-                                print('Memory increase computed by PyTorch forward pass hook:  ', bytes_to_human_readable(model.mem_rss_diff))
-                            dictionary[model_name]["memory"][batch_size][slice_size] = total_memory_consumption
+                                print(
+                                    "\nLines by line memory consumption:\n"
+                                    + "\n".join(
+                                        f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
+                                        for frame, _, mem_str in summary.sequential
+                                    )
+                                )
+                                print(
+                                    "\nLines with top memory consumption:\n"
+                                    + "\n".join(
+                                        f"=> {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
+                                        for frame, _, mem_str in summary.cumulative[:6]
+                                    )
+                                )
+                                print(f"\nTotal memory increase: {summary.total.string}")
+                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.bytes
                         else:
                             dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
@@ -446,7 +484,9 @@ def _compute_pytorch(model_names, batch_sizes, slice_sizes, dictionary, average_
     return dictionary
 
 
-def _compute_tensorflow(model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose):
+def _compute_tensorflow(
+    model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose
+):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name)
@@ -483,32 +523,28 @@ def inference(inputs):
 
                         if not no_memory:
                             # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
-                            _, increased_memory = start_memory_tracing('transformers')
-                            output = inference(sequence)  # you should keep the output otherwise garbage collector will free all :-)
-                            stop_memory_tracing()
-                            del output
-                            total_memory_consumption = sum(m[1] for m in increased_memory)
+                            trace = start_memory_tracing("transformers")
+                            output = inference(sequence)  # noqa: F841
+                            summary = stop_memory_tracing(trace)
+                            del output  # you should keep the output and delete it after otherwise garbage collector will free all :-)
 
                             if verbose:
-                                # Print line by line memory consumption
-                                for frame, mem_increase in increased_memory:
-                                    mem_str = bytes_to_human_readable(mem_increase)
-                                    print(f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
-
-                                # Compute cumulative memory consumption for each line (when we go several times over the same line)
-                                cumulative_increased_memory = defaultdict(lambda: 0)
-                                for frame, mem_increase in increased_memory:
-                                    cumulative_increased_memory[frame] += mem_increase
-
-                                NUM_TOP_LINES = 5 # Print the top N lines consuming the most memory
-                                print(f'\nTop {NUM_TOP_LINES} script lines consuming the most memory:')
-                                top_mem_lines = sorted(list(cumulative_increased_memory.items()), key=lambda x: x[1], reverse=True)[:NUM_TOP_LINES+1]
-                                for i, (frame, mem) in enumerate(top_mem_lines):
-                                    mem_str = bytes_to_human_readable(mem)
-                                    print(f"{i} => {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}")
-
-                                print('\nMemory increase computed by summing traced script lines:', bytes_to_human_readable(total_memory_consumption))
-                            dictionary[model_name]["memory"][batch_size][slice_size] = total_memory_consumption
+                                print(
+                                    "\nLines by line memory consumption:\n"
+                                    + "\n".join(
+                                        f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
+                                        for frame, _, mem_str in summary.sequential
+                                    )
+                                )
+                                print(
+                                    "\nLines with top memory consumption:\n"
+                                    + "\n".join(
+                                        f"=> {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
+                                        for frame, _, mem_str in summary.cumulative[:6]
+                                    )
+                                )
+                                print(f"\nTotal memory increase: {summary.total.string}")
+                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.bytes
                         else:
                             dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
@@ -541,15 +577,9 @@ def main():
         "of all available model "
         "architectures.",
     )
-    parser.add_argument(
-        "--verbose", required=False, action="store_true", help="Verbose memory tracing"
-    )
-    parser.add_argument(
-        "--no_speed", required=False, action="store_true", help="Don't perform speed measurments"
-    )
-    parser.add_argument(
-        "--no_memory", required=False, action="store_true", help="Don't perform memory measurments"
-    )
+    parser.add_argument("--verbose", required=False, action="store_true", help="Verbose memory tracing")
+    parser.add_argument("--no_speed", required=False, action="store_true", help="Don't perform speed measurments")
+    parser.add_argument("--no_memory", required=False, action="store_true", help="Don't perform memory measurments")
     parser.add_argument(
         "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models"
     )
@@ -594,8 +624,8 @@ def main():
     parser.add_argument(
         "--average_over", required=False, default=30, type=int, help="Times an experiment will be run."
     )
-    parser.add_argument('--batch_sizes', nargs='+', type=int, default=[1, 2, 4, 8])
-    parser.add_argument('--slice_sizes', nargs='+', type=int, default=[8, 64, 128, 256, 512, 1024])
+    parser.add_argument("--batch_sizes", nargs="+", type=int, default=[1, 2, 4, 8])
+    parser.add_argument("--slice_sizes", nargs="+", type=int, default=[8, 64, 128, 256, 512, 1024])
 
     args = parser.parse_args()
     if args.models == "all":
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 4d3bd1c0e5ca..e6b634eaadaa 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -71,12 +71,12 @@
     WEIGHTS_NAME,
     add_end_docstrings,
     add_start_docstrings,
+    bytes_to_human_readable,
     cached_path,
     is_tf_available,
     is_torch_available,
     start_memory_tracing,
     stop_memory_tracing,
-    bytes_to_human_readable,
 )
 
 # Model Cards
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 4957e9fd1049..1f2352a6c96e 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -59,6 +59,8 @@ class GPT2Config(PretrainedConfig):
                 Number of hidden layers in the Transformer encoder.
             n_head (:obj:`int`, optional, defaults to 12):
                 Number of attention heads for each attention layer in the Transformer encoder.
+            activation_function (:obj:`str`, optional, defaults to 'gelu_new'):
+                Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"].
             resid_pdrop (:obj:`float`, optional, defaults to 0.1):
                 The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
             embd_pdrop (:obj:`int`, optional, defaults to 0.1):
@@ -125,6 +127,7 @@ def __init__(
         n_embd=768,
         n_layer=12,
         n_head=12,
+        activation_function="gelu_new",
         resid_pdrop=0.1,
         embd_pdrop=0.1,
         attn_pdrop=0.1,
@@ -147,6 +150,7 @@ def __init__(
         self.n_embd = n_embd
         self.n_layer = n_layer
         self.n_head = n_head
+        self.activation_function = activation_function
         self.resid_pdrop = resid_pdrop
         self.embd_pdrop = embd_pdrop
         self.attn_pdrop = attn_pdrop
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 13f19efb5b13..c928d8f7e787 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -6,20 +6,20 @@
 
 import fnmatch
 import json
+import linecache
 import logging
 import os
 import shutil
 import sys
 import tarfile
 import tempfile
+from collections import defaultdict, namedtuple
 from contextlib import contextmanager
 from functools import partial, wraps
 from hashlib import sha256
 from typing import Optional
 from urllib.parse import urlparse
 from zipfile import ZipFile, is_zipfile
-import linecache
-from collections import namedtuple
 
 import boto3
 import requests
@@ -501,15 +501,25 @@ def _resumable_file_manager():
 
 
 _memory_tracing_enabled = False
+Frame = namedtuple("Frame", ["filename", "module", "line_number", "event", "line_text"])
+UsedMemoryState = namedtuple("UsedMemoryState", ["frame", "rss_memory"])
+IncreasedMemoryState = namedtuple("IncreasedMemoryState", ["frame", "rss_memory_increase"])
+
 
-def start_memory_tracing(module_to_trace=None, events_to_trace='line'):
+def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, events_to_trace="line"):
     """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
         See `../../examples/benchmarks.py for a usage example.
+        Current memory consumption is returned using psutil and in particular is the RSS memory
+            "Resident Set Size" (the non-swapped physical memory the process is using).
+            See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
 
         Args:
-            - `module_to_trace`: if not None, string indicating the module or sub-module to trace,
-                only events from this module will be recorded (e.g. 'transformers' or 'transformers.modeling_gpt2')
-                If None, all events are recorded
+            - `modules_to_trace`: (None, string, list/tuple of string)
+                if None, all events are recorded
+                if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
+            - `modules_not_to_trace`: (None, string, list/tuple of string)
+                if None, no module is avoided
+                if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
             - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
                 default to line
 
@@ -519,12 +529,7 @@ def start_memory_tracing(module_to_trace=None, events_to_trace='line'):
                     - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
                     - 'rss_memory': RSS memory state *before* executing the line
 
-                - `IncreasedMemoryState` are named tuples similar to `UsedMemoryState` but recording the *increase* in memory after executing each line.
-                    They have the following fields:
-                    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-                    - 'rss_memory_increase': RSS memory *increase* *after* executing the line
-
-                `Frame` namedtuple are used by `UsedMemoryState` and `IncreasedMemoryState` to list the current frame state.
+                `Frame` namedtuple used by `UsedMemoryState` to list the current frame state.
                     The have the following fields:
                     - 'filename' (string): Name of the file currently executed
                     - 'module' (string): Name of the module currently executed
@@ -538,10 +543,7 @@ def start_memory_tracing(module_to_trace=None, events_to_trace='line'):
     except (ImportError):
         raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
 
-    Frame = namedtuple('Frame', ['filename', 'module', 'line_number', 'event', 'line_text'])
-    UsedMemoryState = namedtuple('UsedMemoryState', ['frame', 'rss_memory'])
-    IncreasedMemoryState = namedtuple('IncreasedMemoryState', ['frame', 'rss_memory_increase'])
-    used_memory_list, increased_memory_list = [], []
+    memory_trace = []
     process = psutil.Process(os.getpid())
 
     def traceit(frame, event, args):
@@ -553,22 +555,36 @@ def traceit(frame, event, args):
         if not _memory_tracing_enabled:
             return traceit
 
-        name = frame.f_globals["__name__"]
+        # Filter events
         if events_to_trace is not None:
             if isinstance(events_to_trace, str) and event != events_to_trace:
                 return traceit
             elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
                 return traceit
-            elif not isinstance(name, str):
-                return traceit
-            elif module_to_trace is not None and module_to_trace not in name:
-                return traceit
+
+        # Filter modules
+        name = frame.f_globals["__name__"]
+        if not isinstance(name, str):
+            return traceit
+        else:
+            # Filter whitelist of modules to trace
+            if modules_to_trace is not None:
+                if isinstance(modules_to_trace, str) and modules_to_trace not in name:
+                    return traceit
+                elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
+                    return traceit
+
+            # Filter blacklist of modules not to trace
+            if modules_not_to_trace is not None:
+                if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
+                    return traceit
+                elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
+                    return traceit
 
         # Record current tracing state (file, location in file...)
         lineno = frame.f_lineno
         filename = frame.f_globals["__file__"]
-        if (filename.endswith(".pyc") or
-            filename.endswith(".pyo")):
+        if filename.endswith(".pyc") or filename.endswith(".pyo"):
             filename = filename[:-1]
         line = linecache.getline(filename, lineno).rstrip()
 
@@ -576,12 +592,7 @@ def traceit(frame, event, args):
         mem = process.memory_info()
         traced_state = Frame(filename, name, lineno, event, line)
         mem_state = UsedMemoryState(traced_state, mem.rss)
-        used_memory_list.append(mem_state)
-
-        if len(used_memory_list) > 1:
-            prev_frame, prev_mem = used_memory_list[-2]
-            increased_mem_state = IncreasedMemoryState(prev_frame, mem.rss - prev_mem)
-            increased_memory_list.append(increased_mem_state)
+        memory_trace.append(mem_state)
 
         return traceit
 
@@ -590,20 +601,66 @@ def traceit(frame, event, args):
     global _memory_tracing_enabled
     _memory_tracing_enabled = True
 
-    return used_memory_list, increased_memory_list
+    return memory_trace
+
+
+TraceMemoryIncrease = namedtuple("TraceMemoryIncrease", ["frame", "bytes", "string"])
+TotalMemoryIncrease = namedtuple("TotalMemoryIncrease", ["bytes", "string"])
+MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
+
 
+def stop_memory_tracing(memory_trace=None):
+    """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
 
-def stop_memory_tracing():
-    """ Stop memory tracing cleanly
+        Args:
+            - memory_trace (optional, default: None): memmory trace to convert in summary
+
+        Return:
+            - None if `memory_trace` is None
+            - `MemorySummary` namedtuple otherwise with the fields:
+                - `sequential`: the list of tuple (Frame, memory increase) computed from the memory_trace list
+                    by substracting the memory after executing each line from the memory before executing said line.
+                - `cumulative`: an OrderedDict of (Frame, cumulative memory increase for the line) with the cumulative increase in memory for each line
+                    (summing repeted memory increase for a line if it's executed several times).
+                    The dictionnary is ordered from the line with the largest memory consumption to the line with the smallest (can be negative if memory is free)
+                - `total`: total memory increaseduring the tracing.
+
+        In the `MemorySummary`, frames are `Frame` namedtuple used to list the current frame state. A `Frame` has the following fields:
+            - 'filename' (string): Name of the file currently executed
+            - 'module' (string): Name of the module currently executed
+            - 'line_number' (int): Number of the line currently executed
+            - 'event' (string): Event that triggered the tracing (default will be "line")
+            - 'line_text' (string): Text of the line in the python script
     """
     global _memory_tracing_enabled
     _memory_tracing_enabled = False
 
+    if memory_trace is not None and len(memory_trace) > 1:
+        memory_diff_trace = []
+        cumulative_memory_dict = defaultdict(lambda: 0)
+        for (frame, mem), (next_frame, next_mem) in zip(memory_trace[:-1], memory_trace[1:]):
+            mem_inc = next_mem - mem
+            mem_str = bytes_to_human_readable(mem_inc)
+            memory_diff_trace.append(TraceMemoryIncrease(frame=frame, bytes=mem_inc, string=mem_str))
+            cumulative_memory_dict[frame] += mem_inc
+
+        cumulative_memory = sorted(list(cumulative_memory_dict.items()), key=lambda x: x[1], reverse=True)
+        cumulative_memory = list(
+            TraceMemoryIncrease(frame=frame, bytes=mem_inc, string=bytes_to_human_readable(mem_inc))
+            for frame, mem_inc in cumulative_memory
+        )
+
+        total_memory = sum(step_trace.bytes for step_trace in memory_diff_trace)
+        total_memory = TotalMemoryIncrease(bytes=total_memory, string=bytes_to_human_readable(total_memory))
+        return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
+
+    return None
+
 
 def bytes_to_human_readable(memory_amount):
     """ Utility to convert a number of bytes (int) in a human readable string (with units)
     """
-    for unit in ['B','KB','MB','GB']:
+    for unit in ["B", "KB", "MB", "GB"]:
         if memory_amount > -1024.0 and memory_amount < 1024.0:
             return f"{memory_amount:.3f}{unit}"
         memory_amount /= 1024.0
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 657b4764ed13..04a95eff2897 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -24,7 +24,7 @@
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
-from .activations import gelu_new, gelu
+from .activations import ACT2FN
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
@@ -203,7 +203,7 @@ def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
-        self.act = gelu_new
+        self.act = ACT2FN[config.activation_function]
         self.dropout = nn.Dropout(config.resid_pdrop)
 
     def forward(self, x):
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index d4fa38d64a75..60bc559a6e86 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -35,6 +35,8 @@
     hf_bucket_url,
     is_remote_url,
 )
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -77,7 +79,6 @@ def _hook_rss_memory_pre_forward(module, *args, **kwargs):
         module.mem_rss_pre_forward = mem.rss
         return None
 
-
     @staticmethod
     def _hook_rss_memory_post_forward(module, *args, **kwargs):
         try:
@@ -89,7 +90,7 @@ def _hook_rss_memory_post_forward(module, *args, **kwargs):
         mem = process.memory_info()
         module.mem_rss_post_forward = mem.rss
         mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward
-        module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, 'mem_rss_diff') else 0)
+        module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0)
         return None
 
     def add_memory_hooks(self):
@@ -99,9 +100,10 @@ def add_memory_hooks(self):
         for module in self.modules():
             module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
             module.register_forward_hook(self._hook_rss_memory_post_forward)
+        self.reset_memory_hooks_state()
 
     def reset_memory_hooks_state(self):
-        for module in self.modules:
+        for module in self.modules():
             module.mem_rss_diff = 0
             module.mem_rss_post_forward = 0
             module.mem_rss_pre_forward = 0

From 88ea59fab338af3814237c2118d9f5aa2d776112 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 14:43:10 +0100
Subject: [PATCH 05/68] no f-strings yet...

---
 src/transformers/file_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index c928d8f7e787..036b37cac432 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -662,6 +662,6 @@ def bytes_to_human_readable(memory_amount):
     """
     for unit in ["B", "KB", "MB", "GB"]:
         if memory_amount > -1024.0 and memory_amount < 1024.0:
-            return f"{memory_amount:.3f}{unit}"
+            return "{:.3f}{}".format(memory_amount, unit)
         memory_amount /= 1024.0
-    return f"{memory_amount:.3f}TB"
+    return "{:.3f}TB".format(memory_amount)

From f46ff48a9e83fa51f3e29d0470fdc7d68a4a3b88 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 16:04:48 +0100
Subject: [PATCH 06/68] add GPU mem logging

---
 examples/benchmarks.py         |  8 +--
 src/transformers/file_utils.py | 93 +++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 0b571a8cec17..7a0800a09dbf 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -452,15 +452,15 @@ def _compute_pytorch(
                                 print(
                                     "\nLines by line memory consumption:\n"
                                     + "\n".join(
-                                        f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
-                                        for frame, _, mem_str in summary.sequential
+                                        f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.sequential
                                     )
                                 )
                                 print(
                                     "\nLines with top memory consumption:\n"
                                     + "\n".join(
-                                        f"=> {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
-                                        for frame, _, mem_str in summary.cumulative[:6]
+                                        f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.cumulative[:6]
                                     )
                                 )
                                 print(f"\nTotal memory increase: {summary.total.string}")
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 036b37cac432..5fb995bb1c98 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -502,11 +502,10 @@ def _resumable_file_manager():
 
 _memory_tracing_enabled = False
 Frame = namedtuple("Frame", ["filename", "module", "line_number", "event", "line_text"])
-UsedMemoryState = namedtuple("UsedMemoryState", ["frame", "rss_memory"])
-IncreasedMemoryState = namedtuple("IncreasedMemoryState", ["frame", "rss_memory_increase"])
+UsedMemoryState = namedtuple("UsedMemoryState", ["frame", "cpu_memory", "gpu_memory"])
 
 
-def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, events_to_trace="line"):
+def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, events_to_trace="line", gpus_to_trace=None):
     """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
         See `../../examples/benchmarks.py for a usage example.
         Current memory consumption is returned using psutil and in particular is the RSS memory
@@ -527,7 +526,8 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
             - `used_memory_list` is a list of `UsedMemoryState` for each event (default each line of the traced script).
                 - `UsedMemoryState` are named tuples with the following fields:
                     - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-                    - 'rss_memory': RSS memory state *before* executing the line
+                    - 'cpu_memory': RSS memory state *before* executing the line
+                    - 'gpu_memory': Used GPU memory *before* executing the line
 
                 `Frame` namedtuple used by `UsedMemoryState` to list the current frame state.
                     The have the following fields:
@@ -541,10 +541,31 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
     try:
         import psutil
     except (ImportError):
-        raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
+        logger.warning("Psutil not installed, we won't log CPU memory usage. "
+                       "Install psutil (pip install psutil) to use CPU memory tracing.")
+        process = None
+    else:
+        process = psutil.Process(os.getpid())
+
+    try:
+        from py3nvml import py3nvml
+        py3nvml.nvmlInit()
+    except ImportError:
+        logger.warning("py3nvml not installed, we won't log GPU memory usage. "
+                       "Install py3nvml (pip install py3nvml) to use GPU memory tracing.")
+        gpu_handle = None
+    except:
+        logger.warning("Error while initializing comunication with GPU. "
+                       "We won't perform GPU memory tracing.")
+        gpu_handle = None
+    else:
+        if gpus_to_trace is None:
+            deviceCount = py3nvml.nvmlDeviceGetCount()
+            gpu_handle = list(py3nvml.nvmlDeviceGetHandleByIndex(i) for i in range(deviceCount))
+        else:
+            gpu_handle = list(py3nvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpus_to_trace))
 
     memory_trace = []
-    process = psutil.Process(os.getpid())
 
     def traceit(frame, event, args):
         """ Tracing method executed before running each line in a module or sub-module
@@ -587,11 +608,21 @@ def traceit(frame, event, args):
         if filename.endswith(".pyc") or filename.endswith(".pyo"):
             filename = filename[:-1]
         line = linecache.getline(filename, lineno).rstrip()
+        traced_state = Frame(filename, name, lineno, event, line)
 
         # Record current memory state (rss memory) and compute difference with previous memory state
-        mem = process.memory_info()
-        traced_state = Frame(filename, name, lineno, event, line)
-        mem_state = UsedMemoryState(traced_state, mem.rss)
+        cpu_mem = 0
+        if process is not None:
+            mem = process.memory_info()
+            cpu_mem = mem.rss
+
+        gpu_mem = 0
+        if gpu_handle is not None:
+            for handle in gpu_handle:
+                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
+                gpu_mem += meminfo.used
+
+        mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
         memory_trace.append(mem_state)
 
         return traceit
@@ -603,9 +634,9 @@ def traceit(frame, event, args):
 
     return memory_trace
 
-
-TraceMemoryIncrease = namedtuple("TraceMemoryIncrease", ["frame", "bytes", "string"])
-TotalMemoryIncrease = namedtuple("TotalMemoryIncrease", ["bytes", "string"])
+Memory = namedtuple("Memory", ["bytes", "string"])
+CPUGPUMemory = namedtuple("TotalMemoryIncrease", ["cpu", "gpu", "cpu_gpu"])
+TraceCPUGPUMemory = namedtuple("TraceMemoryIncrease", ["frame", "cpu", "gpu", "cpu_gpu"])
 MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
 
 
@@ -623,7 +654,7 @@ def stop_memory_tracing(memory_trace=None):
                 - `cumulative`: an OrderedDict of (Frame, cumulative memory increase for the line) with the cumulative increase in memory for each line
                     (summing repeted memory increase for a line if it's executed several times).
                     The dictionnary is ordered from the line with the largest memory consumption to the line with the smallest (can be negative if memory is free)
-                - `total`: total memory increaseduring the tracing.
+                - `total`: total memory increase during the full tracing.
 
         In the `MemorySummary`, frames are `Frame` namedtuple used to list the current frame state. A `Frame` has the following fields:
             - 'filename' (string): Name of the file currently executed
@@ -637,21 +668,33 @@ def stop_memory_tracing(memory_trace=None):
 
     if memory_trace is not None and len(memory_trace) > 1:
         memory_diff_trace = []
-        cumulative_memory_dict = defaultdict(lambda: 0)
-        for (frame, mem), (next_frame, next_mem) in zip(memory_trace[:-1], memory_trace[1:]):
-            mem_inc = next_mem - mem
-            mem_str = bytes_to_human_readable(mem_inc)
-            memory_diff_trace.append(TraceMemoryIncrease(frame=frame, bytes=mem_inc, string=mem_str))
-            cumulative_memory_dict[frame] += mem_inc
-
-        cumulative_memory = sorted(list(cumulative_memory_dict.items()), key=lambda x: x[1], reverse=True)
+        cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
+        for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(memory_trace[:-1], memory_trace[1:]):
+            cpu_mem_inc = next_cpu_mem - cpu_mem
+            cpu_mem_str = bytes_to_human_readable(cpu_mem_inc)
+            gpu_mem_inc = next_gpu_mem - gpu_mem
+            gpu_mem_str = bytes_to_human_readable(gpu_mem_inc)
+            cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
+            cpu_gpu_mem_str = bytes_to_human_readable(cpu_gpu_mem_inc)
+            memory_diff_trace.append(TraceCPUGPUMemory(frame=frame,
+                                                       cpu=Memory(cpu_mem_inc, cpu_mem_str),
+                                                       gpu=Memory(gpu_mem_inc, gpu_mem_str),
+                                                       cpu_gpu=Memory(cpu_gpu_mem_inc, cpu_gpu_mem_str)))
+            cumulative_memory_dict[frame][0] += cpu_mem_inc
+            cumulative_memory_dict[frame][1] += gpu_mem_inc
+            cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
+
+        cumulative_memory = sorted(list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True)  # order by the total CPU + GPU memory increase
         cumulative_memory = list(
-            TraceMemoryIncrease(frame=frame, bytes=mem_inc, string=bytes_to_human_readable(mem_inc))
-            for frame, mem_inc in cumulative_memory
+            TraceCPUGPUMemory(frame=frame,
+                              cpu=Memory(cpu_mem_inc, bytes_to_human_readable(cpu_mem_inc)),
+                              gpu=Memory(gpu_mem_inc, bytes_to_human_readable(gpu_mem_inc)),
+                              cpu_gpu=Memory(cpu_gpu_mem_inc, bytes_to_human_readable(cpu_gpu_mem_inc)))
+            for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
 
-        total_memory = sum(step_trace.bytes for step_trace in memory_diff_trace)
-        total_memory = TotalMemoryIncrease(bytes=total_memory, string=bytes_to_human_readable(total_memory))
+        total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
+        total_memory = Memory(bytes=total_memory, string=bytes_to_human_readable(total_memory))
         return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
 
     return None

From e9182b477a589441fad98f1a5305d27e84b54ddd Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@gmail.com>
Date: Thu, 12 Mar 2020 22:11:47 +0100
Subject: [PATCH 07/68] fix GPU memory monitoring

---
 examples/benchmarks.py         |  7 +++++++
 src/transformers/file_utils.py | 38 +++++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 7a0800a09dbf..125dca4f99b1 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -463,6 +463,13 @@ def _compute_pytorch(
                                         for frame, _, _, cpu_gpu_mem in summary.cumulative[:6]
                                     )
                                 )
+                                print(
+                                    "\nLines with lowest memory consumption:\n"
+                                    + "\n".join(
+                                        f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.cumulative[-6:]
+                                    )
+                                )
                                 print(f"\nTotal memory increase: {summary.total.string}")
                             dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.bytes
                         else:
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 5fb995bb1c98..7108e0bdb025 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -521,6 +521,7 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
                 if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
             - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
                 default to line
+            - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
 
         Return: Tuple with two lists which will be updated during tracing:
             - `used_memory_list` is a list of `UsedMemoryState` for each event (default each line of the traced script).
@@ -550,20 +551,18 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
     try:
         from py3nvml import py3nvml
         py3nvml.nvmlInit()
+        devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
+        py3nvml.nvmlShutdown()
     except ImportError:
         logger.warning("py3nvml not installed, we won't log GPU memory usage. "
                        "Install py3nvml (pip install py3nvml) to use GPU memory tracing.")
-        gpu_handle = None
+        log_gpu = False
     except:
         logger.warning("Error while initializing comunication with GPU. "
                        "We won't perform GPU memory tracing.")
-        gpu_handle = None
+        log_gpu = False
     else:
-        if gpus_to_trace is None:
-            deviceCount = py3nvml.nvmlDeviceGetCount()
-            gpu_handle = list(py3nvml.nvmlDeviceGetHandleByIndex(i) for i in range(deviceCount))
-        else:
-            gpu_handle = list(py3nvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpus_to_trace))
+        log_gpu = _torch_available or _tf_available
 
     memory_trace = []
 
@@ -617,10 +616,21 @@ def traceit(frame, event, args):
             cpu_mem = mem.rss
 
         gpu_mem = 0
-        if gpu_handle is not None:
-            for handle in gpu_handle:
+        if log_gpu:
+            # Clear GPU caches
+            if _torch_available:
+                torch.cuda.empty_cache()
+            if _tf_available:
+                from tensorflow.python.eager import context
+                context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
+
+            # Sum used memory for all GPUs
+            py3nvml.nvmlInit()
+            for i in devices:
+                handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
                 meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                 gpu_mem += meminfo.used
+            py3nvml.nvmlShutdown()
 
         mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
         memory_trace.append(mem_state)
@@ -640,11 +650,12 @@ def traceit(frame, event, args):
 MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
 
 
-def stop_memory_tracing(memory_trace=None):
+def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True):
     """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
 
         Args:
-            - memory_trace (optional, default: None): memmory trace to convert in summary
+            - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
+            - `ignore_released_memory_in_total` (boolean, default: None): if True we only sum memory increase to compute total memory
 
         Return:
             - None if `memory_trace` is None
@@ -693,7 +704,10 @@ def stop_memory_tracing(memory_trace=None):
             for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
 
-        total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
+        if ignore_released_memory_in_total:
+            total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
+        else:
+            total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
         total_memory = Memory(bytes=total_memory, string=bytes_to_human_readable(total_memory))
         return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
 

From 675dfa2b17a7a9cda0aec82c34be8a470becc8b4 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 22:32:02 +0100
Subject: [PATCH 08/68] style and quality

---
 src/transformers/file_utils.py | 50 ++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 7108e0bdb025..bbf699eb851a 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -542,24 +542,28 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
     try:
         import psutil
     except (ImportError):
-        logger.warning("Psutil not installed, we won't log CPU memory usage. "
-                       "Install psutil (pip install psutil) to use CPU memory tracing.")
+        logger.warning(
+            "Psutil not installed, we won't log CPU memory usage. "
+            "Install psutil (pip install psutil) to use CPU memory tracing."
+        )
         process = None
     else:
         process = psutil.Process(os.getpid())
 
     try:
         from py3nvml import py3nvml
+
         py3nvml.nvmlInit()
         devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
         py3nvml.nvmlShutdown()
     except ImportError:
-        logger.warning("py3nvml not installed, we won't log GPU memory usage. "
-                       "Install py3nvml (pip install py3nvml) to use GPU memory tracing.")
+        logger.warning(
+            "py3nvml not installed, we won't log GPU memory usage. "
+            "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
+        )
         log_gpu = False
-    except:
-        logger.warning("Error while initializing comunication with GPU. "
-                       "We won't perform GPU memory tracing.")
+    except (OSError, py3nvml.NVMLError):
+        logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
         log_gpu = False
     else:
         log_gpu = _torch_available or _tf_available
@@ -622,6 +626,7 @@ def traceit(frame, event, args):
                 torch.cuda.empty_cache()
             if _tf_available:
                 from tensorflow.python.eager import context
+
                 context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
 
             # Sum used memory for all GPUs
@@ -644,6 +649,7 @@ def traceit(frame, event, args):
 
     return memory_trace
 
+
 Memory = namedtuple("Memory", ["bytes", "string"])
 CPUGPUMemory = namedtuple("TotalMemoryIncrease", ["cpu", "gpu", "cpu_gpu"])
 TraceCPUGPUMemory = namedtuple("TraceMemoryIncrease", ["frame", "cpu", "gpu", "cpu_gpu"])
@@ -680,27 +686,37 @@ def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True)
     if memory_trace is not None and len(memory_trace) > 1:
         memory_diff_trace = []
         cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
-        for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(memory_trace[:-1], memory_trace[1:]):
+        for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(
+            memory_trace[:-1], memory_trace[1:]
+        ):
             cpu_mem_inc = next_cpu_mem - cpu_mem
             cpu_mem_str = bytes_to_human_readable(cpu_mem_inc)
             gpu_mem_inc = next_gpu_mem - gpu_mem
             gpu_mem_str = bytes_to_human_readable(gpu_mem_inc)
             cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
             cpu_gpu_mem_str = bytes_to_human_readable(cpu_gpu_mem_inc)
-            memory_diff_trace.append(TraceCPUGPUMemory(frame=frame,
-                                                       cpu=Memory(cpu_mem_inc, cpu_mem_str),
-                                                       gpu=Memory(gpu_mem_inc, gpu_mem_str),
-                                                       cpu_gpu=Memory(cpu_gpu_mem_inc, cpu_gpu_mem_str)))
+            memory_diff_trace.append(
+                TraceCPUGPUMemory(
+                    frame=frame,
+                    cpu=Memory(cpu_mem_inc, cpu_mem_str),
+                    gpu=Memory(gpu_mem_inc, gpu_mem_str),
+                    cpu_gpu=Memory(cpu_gpu_mem_inc, cpu_gpu_mem_str),
+                )
+            )
             cumulative_memory_dict[frame][0] += cpu_mem_inc
             cumulative_memory_dict[frame][1] += gpu_mem_inc
             cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
 
-        cumulative_memory = sorted(list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True)  # order by the total CPU + GPU memory increase
+        cumulative_memory = sorted(
+            list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
+        )  # order by the total CPU + GPU memory increase
         cumulative_memory = list(
-            TraceCPUGPUMemory(frame=frame,
-                              cpu=Memory(cpu_mem_inc, bytes_to_human_readable(cpu_mem_inc)),
-                              gpu=Memory(gpu_mem_inc, bytes_to_human_readable(gpu_mem_inc)),
-                              cpu_gpu=Memory(cpu_gpu_mem_inc, bytes_to_human_readable(cpu_gpu_mem_inc)))
+            TraceCPUGPUMemory(
+                frame=frame,
+                cpu=Memory(cpu_mem_inc, bytes_to_human_readable(cpu_mem_inc)),
+                gpu=Memory(gpu_mem_inc, bytes_to_human_readable(gpu_mem_inc)),
+                cpu_gpu=Memory(cpu_gpu_mem_inc, bytes_to_human_readable(cpu_gpu_mem_inc)),
+            )
             for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
 

From 8da965b46c34baaf1925d8a38571da4bb0f18bae Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Thu, 12 Mar 2020 22:48:41 +0100
Subject: [PATCH 09/68] clean up and doc

---
 examples/benchmarks.py         | 52 ++++++++++--------------------
 src/transformers/file_utils.py | 58 ++++++++++++++++++++--------------
 2 files changed, 51 insertions(+), 59 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 125dca4f99b1..4937699c4769 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -20,17 +20,13 @@
 
 import argparse
 import csv
-import os
 import timeit
 from time import time
 from typing import List
 
-import psutil
-
 from transformers import (
     AutoConfig,
     AutoTokenizer,
-    bytes_to_human_readable,
     is_tf_available,
     is_torch_available,
     start_memory_tracing,
@@ -259,21 +255,6 @@
 as they entered."""
 
 
-def get_memory_diff(func, memory_field="rss", repeat=1):
-    """ We include these inside the pytorch model butcan't do it for TF 2.0 so let's keep the method here for now.
-    """
-    process = psutil.Process(os.getpid())
-    mem_diffs = []
-    for _ in range(repeat):
-        mi_before = process.memory_info()
-        output = func()
-        mi_after = process.memory_info()
-        mem_diff = getattr(mi_after, memory_field) - getattr(mi_before, memory_field)
-        mem_diffs.append(mem_diff)
-    output = sum(mem_diffs) / len(mem_diffs)
-    return output
-
-
 def create_setup_and_compute(
     model_names: List[str],
     batch_sizes: List[int],
@@ -328,17 +309,13 @@ def create_setup_and_compute(
                 result = results[model_name]["results"][batch_size][slice_size]
                 memory = results[model_name]["memory"][batch_size][slice_size]
                 if isinstance(result, str):
-                    print(
-                        f"\t\t{model_name}/{batch_size}/{slice_size}: "
-                        f"{result} "
-                        f"{bytes_to_human_readable(memory)}"
-                    )
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory}")
                 else:
                     print(
                         f"\t\t{model_name}/{batch_size}/{slice_size}: "
                         f"{(round(1000 * result) / 1000)}"
                         f"s "
-                        f"{bytes_to_human_readable(memory)}"
+                        f"{memory}"
                     )
 
     if save_to_csv:
@@ -444,9 +421,8 @@ def _compute_pytorch(
 
                             # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                             trace = start_memory_tracing("transformers")
-                            output = inference(sequence)  # noqa: F841
+                            inference(sequence)
                             summary = stop_memory_tracing(trace)
-                            del output  # you should keep the output and delete it after otherwise garbage collector will free all :-)
 
                             if verbose:
                                 print(
@@ -471,7 +447,7 @@ def _compute_pytorch(
                                     )
                                 )
                                 print(f"\nTotal memory increase: {summary.total.string}")
-                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.bytes
+                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.string
                         else:
                             dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
@@ -531,27 +507,33 @@ def inference(inputs):
                         if not no_memory:
                             # Line by line memory tracing (all code in the module `transformers`) works for all models/arbitrary code
                             trace = start_memory_tracing("transformers")
-                            output = inference(sequence)  # noqa: F841
+                            inference(sequence)
                             summary = stop_memory_tracing(trace)
-                            del output  # you should keep the output and delete it after otherwise garbage collector will free all :-)
 
                             if verbose:
                                 print(
                                     "\nLines by line memory consumption:\n"
                                     + "\n".join(
-                                        f"{frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
-                                        for frame, _, mem_str in summary.sequential
+                                        f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.sequential
                                     )
                                 )
                                 print(
                                     "\nLines with top memory consumption:\n"
                                     + "\n".join(
-                                        f"=> {frame.filename}:{frame.line_number}: mem {mem_str}: {frame.line_text}"
-                                        for frame, _, mem_str in summary.cumulative[:6]
+                                        f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.cumulative[:6]
+                                    )
+                                )
+                                print(
+                                    "\nLines with lowest memory consumption:\n"
+                                    + "\n".join(
+                                        f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                                        for frame, _, _, cpu_gpu_mem in summary.cumulative[-6:]
                                     )
                                 )
                                 print(f"\nTotal memory increase: {summary.total.string}")
-                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.bytes
+                            dictionary[model_name]["memory"][batch_size][slice_size] = summary.total.string
                         else:
                             dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
 
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index bbf699eb851a..99dbe2519fc7 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -523,20 +523,20 @@ def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, event
                 default to line
             - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
 
-        Return: Tuple with two lists which will be updated during tracing:
-            - `used_memory_list` is a list of `UsedMemoryState` for each event (default each line of the traced script).
+        Return:
+            - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
                 - `UsedMemoryState` are named tuples with the following fields:
                     - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-                    - 'cpu_memory': RSS memory state *before* executing the line
-                    - 'gpu_memory': Used GPU memory *before* executing the line
+                    - 'cpu_memory': CPU RSS memory state *before* executing the line
+                    - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
 
-                `Frame` namedtuple used by `UsedMemoryState` to list the current frame state.
-                    The have the following fields:
-                    - 'filename' (string): Name of the file currently executed
-                    - 'module' (string): Name of the module currently executed
-                    - 'line_number' (int): Number of the line currently executed
-                    - 'event' (string): Event that triggered the tracing (default will be "line")
-                    - 'line_text' (string): Text of the line in the python script
+        `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
+            `Frame` has the following fields:
+            - 'filename' (string): Name of the file currently executed
+            - 'module' (string): Name of the module currently executed
+            - 'line_number' (int): Number of the line currently executed
+            - 'event' (string): Event that triggered the tracing (default will be "line")
+            - 'line_text' (string): Text of the line in the python script
 
     """
     try:
@@ -651,34 +651,44 @@ def traceit(frame, event, args):
 
 
 Memory = namedtuple("Memory", ["bytes", "string"])
-CPUGPUMemory = namedtuple("TotalMemoryIncrease", ["cpu", "gpu", "cpu_gpu"])
-TraceCPUGPUMemory = namedtuple("TraceMemoryIncrease", ["frame", "cpu", "gpu", "cpu_gpu"])
+MemoryState = namedtuple("MemoryState", ["frame", "cpu", "gpu", "cpu_gpu"])
 MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
 
 
-def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True):
+def stop_memory_tracing(memory_trace=None, ignore_released_memory=True):
     """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
 
         Args:
             - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
-            - `ignore_released_memory_in_total` (boolean, default: None): if True we only sum memory increase to compute total memory
+            - `ignore_released_memory` (boolean, default: True): if True we only sum memory increases to compute total memory
 
         Return:
             - None if `memory_trace` is None
             - `MemorySummary` namedtuple otherwise with the fields:
-                - `sequential`: the list of tuple (Frame, memory increase) computed from the memory_trace list
+                - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
                     by substracting the memory after executing each line from the memory before executing said line.
-                - `cumulative`: an OrderedDict of (Frame, cumulative memory increase for the line) with the cumulative increase in memory for each line
-                    (summing repeted memory increase for a line if it's executed several times).
-                    The dictionnary is ordered from the line with the largest memory consumption to the line with the smallest (can be negative if memory is free)
-                - `total`: total memory increase during the full tracing.
+                - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
+                    obtained by summing repeated memory increases for a line if it's executed several times.
+                    The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
+                - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
+                    Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
 
-        In the `MemorySummary`, frames are `Frame` namedtuple used to list the current frame state. A `Frame` has the following fields:
+        `Memory` named tuples have the following fields:
+            - `bytes` (integer): number of bytes,
+            - `string` (string): the same amount as a human-readable string (ex: "3.5MB")
+
+        `Frame` are namedtuple used to list the current frame state and have the following fields:
             - 'filename' (string): Name of the file currently executed
             - 'module' (string): Name of the module currently executed
             - 'line_number' (int): Number of the line currently executed
             - 'event' (string): Event that triggered the tracing (default will be "line")
             - 'line_text' (string): Text of the line in the python script
+
+        `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
+            - `frame` (`Frame`): the current frame (see above)
+            - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
+            - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
+            - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
     """
     global _memory_tracing_enabled
     _memory_tracing_enabled = False
@@ -696,7 +706,7 @@ def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True)
             cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
             cpu_gpu_mem_str = bytes_to_human_readable(cpu_gpu_mem_inc)
             memory_diff_trace.append(
-                TraceCPUGPUMemory(
+                MemoryState(
                     frame=frame,
                     cpu=Memory(cpu_mem_inc, cpu_mem_str),
                     gpu=Memory(gpu_mem_inc, gpu_mem_str),
@@ -711,7 +721,7 @@ def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True)
             list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
         )  # order by the total CPU + GPU memory increase
         cumulative_memory = list(
-            TraceCPUGPUMemory(
+            MemoryState(
                 frame=frame,
                 cpu=Memory(cpu_mem_inc, bytes_to_human_readable(cpu_mem_inc)),
                 gpu=Memory(gpu_mem_inc, bytes_to_human_readable(gpu_mem_inc)),
@@ -720,7 +730,7 @@ def stop_memory_tracing(memory_trace=None, ignore_released_memory_in_total=True)
             for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
         )
 
-        if ignore_released_memory_in_total:
+        if ignore_released_memory:
             total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
         else:
             total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)

From 4cd18c190f2b335a87dcb2d4bb851fad68d630c8 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Fri, 13 Mar 2020 14:17:21 -0400
Subject: [PATCH 10/68] boom boom

---
 examples/benchmarks.py | 145 +++++++++++++++++++++++++++--------------
 1 file changed, 95 insertions(+), 50 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 4937699c4769..1b11640a7fa4 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -319,54 +319,58 @@ def create_setup_and_compute(
                     )
 
     if save_to_csv:
-        with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
-            fieldnames = [
-                "model",
-                "1x8",
-                "1x64",
-                "1x128",
-                "1x256",
-                "1x512",
-                "1x1024",
-                "2x8",
-                "2x64",
-                "2x128",
-                "2x256",
-                "2x512",
-                "2x1024",
-                "4x8",
-                "4x64",
-                "4x128",
-                "4x256",
-                "4x512",
-                "4x1024",
-                "8x8",
-                "8x64",
-                "8x128",
-                "8x256",
-                "8x512",
-                "8x1024",
-            ]
-
-            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
-            writer.writeheader()
-            memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames)
-            memory_writer.writeheader()
-
-            for model_name in model_names:
-                model_results = {
-                    f"{bs}x{ss}": results[model_name]["results"][bs][ss]
-                    for bs in results[model_name]["results"]
-                    for ss in results[model_name]["results"][bs]
-                }
-                writer.writerow({"model": model_name, **model_results})
-
-                model_memory_results = {
-                    f"{bs}x{ss}": results[model_name]["memory"][bs][ss]
-                    for bs in results[model_name]["memory"]
-                    for ss in results[model_name]["memory"][bs]
-                }
-                memory_writer.writerow({"model": model_name, **model_memory_results})
+        write_to_csv(csv_filename, csv_memory_filename, model_names, results)
+
+
+def write_to_csv(csv_filename, csv_memory_filename, model_names, results):
+    with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file:
+        fieldnames = [
+            "model",
+            "1x8",
+            "1x64",
+            "1x128",
+            "1x256",
+            "1x512",
+            "1x1024",
+            "2x8",
+            "2x64",
+            "2x128",
+            "2x256",
+            "2x512",
+            "2x1024",
+            "4x8",
+            "4x64",
+            "4x128",
+            "4x256",
+            "4x512",
+            "4x1024",
+            "8x8",
+            "8x64",
+            "8x128",
+            "8x256",
+            "8x512",
+            "8x1024",
+        ]
+
+        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+        writer.writeheader()
+        memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames)
+        memory_writer.writeheader()
+
+        for model_name in model_names:
+            model_results = {
+                f"{bs}x{ss}": results[model_name]["results"][bs][ss]
+                for bs in results[model_name]["results"]
+                for ss in results[model_name]["results"][bs]
+            }
+            writer.writerow({"model": model_name, **model_results})
+
+            model_memory_results = {
+                f"{bs}x{ss}": results[model_name]["memory"][bs][ss]
+                for bs in results[model_name]["memory"]
+                for ss in results[model_name]["memory"][bs]
+            }
+            memory_writer.writerow({"model": model_name, **model_memory_results})
 
 
 def _compute_pytorch(
@@ -453,8 +457,8 @@ def _compute_pytorch(
 
                         if not no_speed:
                             print("Going through model with sequence of shape", sequence.shape)
-                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
-                            average_time = sum(runtimes) / float(len(runtimes)) / 3.0
+                            runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=1)
+                            average_time = sum(runtimes) / float(len(runtimes)) / 1.0
                             dictionary[model_name]["results"][batch_size][slice_size] = average_time
                         else:
                             dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
@@ -466,6 +470,47 @@ def _compute_pytorch(
                         dictionary[model_name]["memory"][batch_size][slice_size] = "N/A"
     return dictionary
 
+#def summarize_generate(model, input_ids):
+
+
+class MemoryViewer:
+
+    def __init__(self, summary):
+        self.summary = summary
+    @property
+    def total(self):
+        print(f"\nTotal memory increase: {self.summary.total.string}")
+
+    @property
+    def line_by_line(self):
+        print(
+            "\nLines by line memory consumption:\n"
+            + "\n".join(
+                f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.sequential
+            )
+        )
+
+    def top_lines(self, n=6):
+        print(
+            "\nLines with top memory consumption:\n"
+            + "\n".join(
+                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[:n]
+            )
+        )
+
+
+    def bottom_lines(self, n=6):
+        print(
+            "\nLines with top memory consumption:\n"
+            + "\n".join(
+                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[-n:]
+            )
+        )
+
+
 
 def _compute_tensorflow(
     model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose

From f6d2c64aa7d0c19e9cf2eeae74d70c31698982d2 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sat, 14 Mar 2020 23:58:44 -0400
Subject: [PATCH 11/68] add test

---
 tests/test_modeling_bart.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 1ea46a091bcc..7d67078dfe5a 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -40,6 +40,7 @@
         LARGE_NEGATIVE,
     )
     from transformers.tokenization_bart import BartTokenizer
+    from transformers import start_memory_tracing, stop_memory_tracing
 
 
 @require_torch
@@ -291,6 +292,7 @@ def test_generate_beam_search(self):
         # TODO(SS): uneven length batches, empty inputs
 
     def test_shift_tokens_right(self):
+
         input_ids = torch.Tensor([[71, 82, 18, 33, 2, 1, 1], [68, 34, 26, 58, 30, 82, 2]]).long()
         shifted = shift_tokens_right(input_ids, 1)
         n_pad_before = input_ids.eq(1).float().sum()
@@ -316,7 +318,10 @@ def test_generate_fp16(self):
         config, input_ids, batch_size = self._get_config_and_data(output_past=True)
         attention_mask = input_ids.ne(1).to(torch_device)
         model = BartForConditionalGeneration(config).eval().to(torch_device).half()
+        trace = start_memory_tracing(modules_to_trace="transformers")
         model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
+        summary = stop_memory_tracing(trace)
+        import ipdb; ipdb.set_trace()
 
     @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
     def test_base_model_fp16(self):
@@ -437,6 +442,7 @@ def test_cnn_summarization_same_as_fairseq_easy(self):
         text = " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian"
         tokens = tok.encode(text, return_tensors="pt").to(torch_device)
         extra_len = 20
+
         gen_tokens = hf.generate(
             tokens,
             num_beams=4,

From 6849c29ceb5b14d06d96fdcd175a87fbc07b9281 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sat, 14 Mar 2020 23:59:58 -0400
Subject: [PATCH 12/68] viewer

---
 src/transformers/file_utils.py | 38 ++++++++++++++++++++++++++++++++++
 tests/test_modeling_bart.py    |  4 +++-
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 99dbe2519fc7..7d270aac8e65 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -748,3 +748,41 @@ def bytes_to_human_readable(memory_amount):
             return "{:.3f}{}".format(memory_amount, unit)
         memory_amount /= 1024.0
     return "{:.3f}TB".format(memory_amount)
+
+
+class MemoryViewer:
+
+    def __init__(self, summary):
+        self.summary = summary
+    @property
+    def total(self):
+        print(f"\nTotal memory increase: {self.summary.total.string}")
+
+    @property
+    def line_by_line(self):
+        print(
+            "\nLines by line memory consumption:\n"
+            + "\n".join(
+                f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.sequential
+            )
+        )
+
+    def top_lines(self, n=6):
+        print(
+            "\nLines with top memory consumption:\n"
+            + "\n".join(
+                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[:n]
+            )
+        )
+
+
+    def bottom_lines(self, n=6):
+        print(
+            "\nLines with top memory consumption:\n"
+            + "\n".join(
+                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
+                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[-n:]
+            )
+        )
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 7d67078dfe5a..f063584781f7 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -24,6 +24,7 @@
 from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
+
 if is_torch_available():
     import torch
     from transformers import (
@@ -41,6 +42,7 @@
     )
     from transformers.tokenization_bart import BartTokenizer
     from transformers import start_memory_tracing, stop_memory_tracing
+    from transformers.file_utils import MemoryViewer
 
 
 @require_torch
@@ -320,7 +322,7 @@ def test_generate_fp16(self):
         model = BartForConditionalGeneration(config).eval().to(torch_device).half()
         trace = start_memory_tracing(modules_to_trace="transformers")
         model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
-        summary = stop_memory_tracing(trace)
+        summary = MemoryViewer(stop_memory_tracing(trace))
         import ipdb; ipdb.set_trace()
 
     @unittest.skipIf(torch_device == "cpu", "Cant do half precision")

From 727e7543727e61cba7eb9e1128fe251ac5be7530 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 00:09:06 -0400
Subject: [PATCH 13/68] saver

---
 src/transformers/file_utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 7d270aac8e65..720a8df94cfd 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -760,7 +760,7 @@ def total(self):
 
     @property
     def line_by_line(self):
-        print(
+        return (
             "\nLines by line memory consumption:\n"
             + "\n".join(
                 f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
@@ -768,6 +768,11 @@ def line_by_line(self):
             )
         )
 
+    def save_line_by_line(self, path):
+        res = self.line_by_line
+        with open(path, 'w') as f:
+            f.write(res)
+
     def top_lines(self, n=6):
         print(
             "\nLines with top memory consumption:\n"

From 562e6c50f969a95fa55a0d11ea65a40f2a28090f Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 00:28:27 -0400
Subject: [PATCH 14/68] add to mnli

---
 tests/test_modeling_bart.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index f063584781f7..4f31651a9925 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -287,6 +287,7 @@ def test_generate_beam_search(self):
         lm_model.eval()
 
         max_length = 5
+
         new_input_ids = lm_model.generate(
             input_ids.clone(), num_return_sequences=1, num_beams=2, no_repeat_ngram_size=3, max_length=max_length
         )
@@ -323,7 +324,9 @@ def test_generate_fp16(self):
         trace = start_memory_tracing(modules_to_trace="transformers")
         model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
         summary = MemoryViewer(stop_memory_tracing(trace))
-        import ipdb; ipdb.set_trace()
+        summary.save_line_by_line('hf_mem.txt')
+
+
 
     @unittest.skipIf(torch_device == "cpu", "Cant do half precision")
     def test_base_model_fp16(self):
@@ -408,14 +411,15 @@ def test_mnli_inference(self):
 
         example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1]
         input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b])
-
-        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli").to(
-            torch_device
-        )  # eval called in from_pre
+        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli").to(torch_device)
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         # Test that model hasn't changed
+        trace = start_memory_tracing(modules_to_trace="transformers")
+
         with torch.no_grad():
             batched_logits, features = model.forward(**inputs_dict)
+        summary = MemoryViewer(stop_memory_tracing(trace))
+        summary.save_line_by_line('hf_mem.txt')
         expected_shape = torch.Size((2, 3))
         self.assertEqual(batched_logits.shape, expected_shape)
         expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device)

From c9c0e74282ff61b4467015ad17e6b53b725c9e1d Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 00:28:55 -0400
Subject: [PATCH 15/68] different fnames

---
 tests/test_modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 4f31651a9925..1d0a192ef3a0 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -324,7 +324,7 @@ def test_generate_fp16(self):
         trace = start_memory_tracing(modules_to_trace="transformers")
         model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
         summary = MemoryViewer(stop_memory_tracing(trace))
-        summary.save_line_by_line('hf_mem.txt')
+        summary.save_line_by_line('hf_mem_half_gen.txt')
 
 
 

From 8b7ae1ba6bb051bfc40621b6711d0d894aec7b06 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 11:29:48 -0400
Subject: [PATCH 16/68] use LoggingMixin

---
 src/transformers/file_utils.py    | 33 ++++++++++++++++++++++++++++++-
 src/transformers/modeling_bart.py | 17 ++++++++++++----
 tests/test_modeling_bart.py       | 13 ++++++++++++
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 720a8df94cfd..cf9de383eb94 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -30,6 +30,7 @@
 
 from . import __version__
 
+from durbango.logging_utils import LoggingMixin
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
@@ -650,6 +651,23 @@ def traceit(frame, event, args):
     return memory_trace
 
 
+from py3nvml import py3nvml
+
+
+def run_gpu_mem_counter():
+    # Sum used memory for all GPUs
+    if not torch.cuda.is_available(): return 0
+    py3nvml.nvmlInit()
+    devices = list(range(py3nvml.nvmlDeviceGetCount())) #if gpus_to_trace is None else gpus_to_trace
+    gpu_mem = 0
+    for i in devices:
+        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
+        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
+        gpu_mem += meminfo.used
+    py3nvml.nvmlShutdown()
+    return gpu_mem
+
+
 Memory = namedtuple("Memory", ["bytes", "string"])
 MemoryState = namedtuple("MemoryState", ["frame", "cpu", "gpu", "cpu_gpu"])
 MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
@@ -748,7 +766,20 @@ def bytes_to_human_readable(memory_amount):
             return "{:.3f}{}".format(memory_amount, unit)
         memory_amount /= 1024.0
     return "{:.3f}TB".format(memory_amount)
-
+import psutil
+import time
+
+def collect_log_data(msg=''):
+    process = psutil.Process(os.getpid())
+    cpu_mem = process.memory_info().rss
+    gpu_mem = run_gpu_mem_counter()
+    record = dict(cpu_mem=cpu_mem, gpu_mem=gpu_mem,
+         time = time.time(),
+         msg=msg)
+    long_msg = f'{msg}: GPU: {bytes_to_human_readable(gpu_mem)} CPU: {bytes_to_human_readable(gpu_mem)}'
+    record['long_msg'] = long_msg
+    print(long_msg)
+    return record
 
 class MemoryViewer:
 
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 408ca0f31dee..91348b502419 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -22,9 +22,9 @@
 from torch import Tensor, nn
 
 from .configuration_bart import BartConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
+from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, collect_log_data, bytes_to_human_readable
 from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids
-
+from durbango.logging_utils import LoggingMixin
 
 logger = logging.getLogger(__name__)
 
@@ -414,7 +414,7 @@ def forward(
         )  # just self_attn weights for now, following t5, layer_state = cache for decoding
 
 
-class BartDecoder(nn.Module):
+class BartDecoder(nn.Module, LoggingMixin):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer
     is a :class:`DecoderLayer`.
@@ -478,6 +478,7 @@ def forward(
 
         # embed positions
         positions = self.embed_positions(input_ids, generation_mode=self.generation_mode)
+        self.log_mem('decoder: embedded positions')
 
         if self.generation_mode:
             input_ids = input_ids[:, -1:]
@@ -485,6 +486,7 @@ def forward(
             assert input_ids.ne(self.padding_idx).any()
 
         x = self.embed_tokens(input_ids)
+        self.log_mem('decoder: embedded tokens')
         x += positions
 
         x = self.layernorm_embedding(x)
@@ -511,6 +513,7 @@ def forward(
                 attention_mask=combined_mask,
                 need_attn_weights=self.output_attentions,
             )
+            self.log_mem(f'decoder: called attn {i}')
 
             if self.output_past:
                 next_decoder_cache.append(layer_past.copy())
@@ -808,11 +811,13 @@ def _filter_out_falsey_values(tup) -> Tuple:
 
 # Public API
 
+import time
+import pandas as pd
 
 @add_start_docstrings(
     "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
 )
-class BartModel(PretrainedBartModel):
+class BartModel(PretrainedBartModel, LoggingMixin):
     def __init__(self, config: BartConfig):
         super().__init__(config)
         self.output_attentions = config.output_attentions
@@ -824,6 +829,7 @@ def __init__(self, config: BartConfig):
         self.encoder = BartEncoder(config, self.shared)
         self.decoder = BartDecoder(config, self.shared)
 
+
         self.init_weights()
 
     @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
@@ -944,6 +950,7 @@ def forward(
             tokenizer.decode(predictions).split()
             # ['good', 'great', 'all', 'really', 'very']
         """
+        self.model.log_mem('before BartModel.forward')
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -952,7 +959,9 @@ def forward(
             decoder_attention_mask=decoder_attention_mask,
             decoder_cached_states=decoder_cached_states,
         )
+        self.model.log_mem('after call, before lm_head')
         lm_logits = self.lm_head(outputs[0])
+        self.model.log_mem('after lm_head')
         outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
             loss_fct = nn.CrossEntropyLoss()
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 1d0a192ef3a0..b6142b0e25c8 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -441,9 +441,22 @@ def test_model_from_pretrained(self):
             model = BartModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
             self.assertIsNotNone(model)
 
+    @slow
+    def test_compare_generation_mem(self):
+        hf = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(torch_device)
+        if torch_device == 'cuda':
+            hf = hf.half()
+        tok = BartTokenizer.from_pretrained("bart-large")
+        text = " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian"
+        tokens = tok.encode(text, return_tensors="pt").to(torch_device)
+
+
+
     @slow
     def test_cnn_summarization_same_as_fairseq_easy(self):
         hf = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(torch_device)
+        if torch_device == 'cuda':
+            hf = hf.half()
         tok = BartTokenizer.from_pretrained("bart-large")
         text = " (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian"
         tokens = tok.encode(text, return_tensors="pt").to(torch_device)

From a229a277342ed6225815bfe9e7ea98b1a8f693ee Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 11:53:37 -0400
Subject: [PATCH 17/68] more logging

---
 src/transformers/modeling_bart.py | 21 +++++++++++++++------
 tests/test_modeling_bart.py       |  1 +
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 91348b502419..994971da53c7 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -107,7 +107,7 @@ def _prepare_bart_decoder_inputs(
     return decoder_input_ids, decoder_attn_mask
 
 
-class PretrainedBartModel(PreTrainedModel):
+class PretrainedBartModel(PreTrainedModel, LoggingMixin):
     config_class = BartConfig
     base_model_prefix = "model"
     pretrained_model_archive_map = BART_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -240,7 +240,7 @@ def forward(self, x, encoder_padding_mask):
         return x, attn_weights
 
 
-class BartEncoder(nn.Module):
+class BartEncoder(nn.Module, LoggingMixin):
     """
     Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer
     is a :class:`EncoderLayer`.
@@ -304,7 +304,7 @@ def forward(
         encoder_states, all_attentions = [], []
 
         # encoder layers
-        for encoder_layer in self.layers:
+        for i, encoder_layer in enumerate(self.layers):
 
             if self.output_hidden_states:
                 encoder_states.append(x)
@@ -314,6 +314,7 @@ def forward(
                 attn = None
             else:
                 x, attn = encoder_layer(x, attention_mask)
+            self.log_mem('encoder: called layer {}')
 
             if self.output_attentions:
                 all_attentions.append(attn)
@@ -544,7 +545,7 @@ def reorder_attn_buffer(input_buffer, new_order):
     return input_buffer
 
 
-class SelfAttention(nn.Module):
+class SelfAttention(nn.Module, LoggingMixin):
     """Multi-headed attention from "Attention Is All You Need"""
 
     def __init__(
@@ -583,6 +584,10 @@ def __init__(
     def _shape(self, tensor, dim_0, bsz):
         return tensor.contiguous().view(dim_0, bsz * self.num_heads, self.head_dim).transpose(0, 1)
 
+
+    def log_mem(self, msg='', verbose=False):
+        super().log_mem(msg=f'{self.cache_key}_attn:{msg}', verbose=verbose)
+
     def forward(
         self,
         query,
@@ -623,6 +628,7 @@ def forward(
             layer_state = {}
 
         q = self.q_proj(query) * self.scaling
+        self.log_mem('\tq_proj')
         if self.encoder_decoder_attention:
             if key is None:
                 assert value is None
@@ -635,12 +641,15 @@ def forward(
             v = self.v_proj(query)
 
         q = self._shape(q, tgt_len, bsz)
+        self.log_mem('\tq_reshape')
         if k is not None:
             k = self._shape(k, -1, bsz)
         if v is not None:
             v = self._shape(v, -1, bsz)
+            self.log_mem('\t done reshaping k,v')
 
         if saved_state is not None:
+            self.log_mem('\t about to use saved_state')
             k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)
         # assert self.cache_key != 'encoder_decoder' or key_padding_mask is None
 
@@ -739,7 +748,7 @@ def _cat_prev_key_padding_mask(
         return new_key_padding_mask
 
 
-class BartClassificationHead(nn.Module):
+class BartClassificationHead(nn.Module, LoggingMixin):
     """Head for sentence-level classification tasks."""
 
     # This can trivially be shared with RobertaClassificationHead
@@ -817,7 +826,7 @@ def _filter_out_falsey_values(tup) -> Tuple:
 @add_start_docstrings(
     "The bare BART Model outputting raw hidden-states without any specific head on top.", BART_START_DOCSTRING,
 )
-class BartModel(PretrainedBartModel, LoggingMixin):
+class BartModel(PretrainedBartModel):
     def __init__(self, config: BartConfig):
         super().__init__(config)
         self.output_attentions = config.output_attentions
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index b6142b0e25c8..b69e26f2648e 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -246,6 +246,7 @@ def test_lm_forward(self):
         expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
         self.assertIsInstance(loss.item(), float)
+        log_df = lm_model.combine_logs()
 
     def test_lm_uneven_forward(self):
         config = BartConfig(

From bc076020a86b37fac0b5de231c68cbd46cea36ae Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 12:59:24 -0400
Subject: [PATCH 18/68] add preinit

---
 src/transformers/modeling_bart.py | 8 ++++----
 tests/test_modeling_bart.py       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 994971da53c7..b501dd743763 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -327,7 +327,7 @@ def forward(
         return x, encoder_states, all_attentions
 
 
-class DecoderLayer(nn.Module):
+class DecoderLayer(nn.Module, LoggingMixin):
     def __init__(self, config: BartConfig):
         super().__init__()
         self.embed_dim = config.d_model
@@ -898,9 +898,9 @@ class BartForConditionalGeneration(PretrainedBartModel):
 
     def __init__(self, config: BartConfig):
         super().__init__(config)
-        # if base_model is None:
-        base_model = BartModel(config)
-        self.model = base_model
+        # if base_model is Nones:
+        self.log_mem('pre-init')
+        self.model = BartModel(config)
         self.lm_head = _make_linear_from_emb(self.model.shared)
 
     def tie_weights(self):
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index b69e26f2648e..3a8599720df6 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -246,7 +246,7 @@ def test_lm_forward(self):
         expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
         self.assertIsInstance(loss.item(), float)
-        log_df = lm_model.combine_logs()
+
 
     def test_lm_uneven_forward(self):
         config = BartConfig(

From 09a68941d5b9b4ae2eab76fdb6dca6bb73e0e878 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 13:19:25 -0400
Subject: [PATCH 19/68] don't log preinit

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index b501dd743763..a09950254d9c 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -899,7 +899,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
     def __init__(self, config: BartConfig):
         super().__init__(config)
         # if base_model is Nones:
-        self.log_mem('pre-init')
+        #self.log_mem('pre-init')
         self.model = BartModel(config)
         self.lm_head = _make_linear_from_emb(self.model.shared)
 

From 75381a478a145055ebccfd57e74609d492352d0a Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 14:40:27 -0400
Subject: [PATCH 20/68] only sometimes update layer state

---
 src/transformers/modeling_bart.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index a09950254d9c..21ef0efaebc1 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -225,6 +225,7 @@ def forward(self, x, encoder_padding_mask):
         residual = x
         x, attn_weights = self.self_attn(
             query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, need_weights=self.output_attentions,
+            update_layer_state=False,
         )
         x = F.dropout(x, p=self.dropout, training=self.training)
         x = residual + x
@@ -595,6 +596,7 @@ def forward(
         value: Optional[Tensor],
         key_padding_mask: Optional[Tensor] = None,
         layer_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+        update_layer_state=True,
         need_weights: bool = False,
         static_kv: bool = False,
         attn_mask: Optional[Tensor] = None,
@@ -654,11 +656,12 @@ def forward(
         # assert self.cache_key != 'encoder_decoder' or key_padding_mask is None
 
         # Update cache
-        layer_state[self.cache_key] = {
-            "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
-            "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
-            "prev_key_padding_mask": key_padding_mask if not static_kv else None,
-        }
+        if update_layer_state:
+            layer_state[self.cache_key] = {
+                "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
+                "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
+                "prev_key_padding_mask": key_padding_mask if not static_kv else None,
+            }
 
         assert k is not None
         src_len = k.size(1)

From cc0363c5adc214275a7fc3854fe5664a67c0dfbb Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 14:48:00 -0400
Subject: [PATCH 21/68] Script

---
 src/transformers/bart_mem_prof.py | 51 +++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 src/transformers/bart_mem_prof.py

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
new file mode 100644
index 000000000000..fb405e6c93b4
--- /dev/null
+++ b/src/transformers/bart_mem_prof.py
@@ -0,0 +1,51 @@
+from transformers import *
+import torch
+DEFAULT_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE):
+
+    tokenizer = BartTokenizer.from_pretrained('bart-large')
+    lns = [" " + x.rstrip() for x in open(source_path).readlines()][:batch_size]
+    model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
+    dct = tokenizer.batch_encode_plus(lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
+    ids = dct['input_ids'].to(DEFAULT_DEVICE)
+    msk = dct['attention_mask'].to(DEFAULT_DEVICE)
+    model.log_mem('starting')
+    summaries = model.generate(
+        input_ids=ids,
+        attention_mask=msk,
+        num_beams=4,
+        length_penalty=2.0,
+        max_length=140 + 2,  # +2 from original because we start at step=1 and stop before max_length
+        min_length=55 + 1,  # +1 from original because we start at step=1
+        no_repeat_ngram_size=3,
+        early_stopping=True,
+        do_sample=False,
+        decoder_start_token_id=model.config.eos_token_ids[0],
+    )
+    model.log_mem('done')
+    dec = [tokenizer.decode(s) for s in summaries]
+    log_df = model.combine_logs()
+    log_df.to_csv(out_file)
+    print(dec[0])
+
+import argparse
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "source_path", type=str, default="/home/shleifer/transformers_fork/notebooks/test.source",
+        help="like cnn_dm/test.source",
+    )
+    parser.add_argument(
+        "output_path", type=str, help="where to save summaries",
+    )
+    parser.add_argument(
+        "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.",
+    )
+    parser.add_argument(
+        "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time",
+    )
+    args = parser.parse_args()
+    runner(args.source_path, args.output_path, batch_size=args.bs, device=args.device)
+
+
+

From 480c8c6518b1e749279f2711f5633e06eb64d310 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 14:49:59 -0400
Subject: [PATCH 22/68] default

---
 src/transformers/bart_mem_prof.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index fb405e6c93b4..47599f646081 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -31,12 +31,13 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE):
 import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+
     parser.add_argument(
-        "source_path", type=str, default="/home/shleifer/transformers_fork/notebooks/test.source",
-        help="like cnn_dm/test.source",
+        "output_path", type=str, help="where to save summaries",
     )
     parser.add_argument(
-        "output_path", type=str, help="where to save summaries",
+        "--source_path", type=str, default="/home/shleifer/transformers_fork/notebooks/test.source",
+        help="like cnn_dm/test.source", required=False
     )
     parser.add_argument(
         "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.",

From 3da23d47c8de7502e417b15b252f871463c46c80 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 15:17:56 -0400
Subject: [PATCH 23/68] Do generate flag

---
 src/transformers/bart_mem_prof.py | 43 +++++++++++++++++++------------
 src/transformers/modeling_bart.py | 10 +++----
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index 47599f646081..81158831b239 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -1,7 +1,7 @@
 from transformers import *
 import torch
 DEFAULT_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE):
+def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_generate=False):
 
     tokenizer = BartTokenizer.from_pretrained('bart-large')
     lns = [" " + x.rstrip() for x in open(source_path).readlines()][:batch_size]
@@ -10,20 +10,28 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE):
     ids = dct['input_ids'].to(DEFAULT_DEVICE)
     msk = dct['attention_mask'].to(DEFAULT_DEVICE)
     model.log_mem('starting')
-    summaries = model.generate(
-        input_ids=ids,
-        attention_mask=msk,
-        num_beams=4,
-        length_penalty=2.0,
-        max_length=140 + 2,  # +2 from original because we start at step=1 and stop before max_length
-        min_length=55 + 1,  # +1 from original because we start at step=1
-        no_repeat_ngram_size=3,
-        early_stopping=True,
-        do_sample=False,
-        decoder_start_token_id=model.config.eos_token_ids[0],
-    )
-    model.log_mem('done')
-    dec = [tokenizer.decode(s) for s in summaries]
+    if prof_generate:
+        summaries = model.generate(
+            input_ids=ids,
+            attention_mask=msk,
+            num_beams=4,
+            length_penalty=2.0,
+            max_length=140 + 2,  # +2 from original because we start at step=1 and stop before max_length
+            min_length=55 + 1,  # +1 from original because we start at step=1
+            no_repeat_ngram_size=3,
+            early_stopping=True,
+            do_sample=False,
+            decoder_start_token_id=model.config.eos_token_ids[0],
+        )
+        model.log_mem('done')
+        dec = [tokenizer.decode(s) for s in summaries]
+    else:
+        #model.decoder.generation_mode = False
+        model(
+            input_ids=ids,
+            attention_mask=msk,
+        )
+
     log_df = model.combine_logs()
     log_df.to_csv(out_file)
     print(dec[0])
@@ -45,8 +53,11 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE):
     parser.add_argument(
         "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time",
     )
+    parser.add_argument(
+        "--do-generate", action='store_true', required=False, help="batch size: how many to summarize at a time",
+    )
     args = parser.parse_args()
-    runner(args.source_path, args.output_path, batch_size=args.bs, device=args.device)
+    runner(args.source_path, args.output_path, batch_size=args.bs, device=args.device, prof_generate=args.do_generate)
 
 
 
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 21ef0efaebc1..f04b56fa3ea0 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -984,17 +984,17 @@ def forward(
         return outputs
 
     def prepare_inputs_for_generation(self, decoder_input_ids, past, encoder_inputs, attention_mask):
-        assert attention_mask.shape == encoder_inputs.shape, "attn_mask.shape != encoder_input.shape: {} =! {}".format(
-            attention_mask.shape, encoder_inputs.shape
-        )
+        # assert attention_mask.shape == encoder_inputs.shape, "attn_mask.shape != encoder_input.shape: {} =! {}".format(
+        #     attention_mask.shape, encoder_inputs.shape
+        # )
         if past is None:  # first step
             encoder_outputs, decoder_cached_states = None, None
         else:
             encoder_outputs, decoder_cached_states = past
 
-        input_ids = encoder_inputs
+        input_ids = encoder_inputs # FIXME(SS): dont need this
         return {
-            "input_ids": input_ids,  # ignored after first pass
+            "input_ids": encoder_inputs,  # ignored after first pass
             "encoder_outputs": encoder_outputs,
             "decoder_cached_states": decoder_cached_states,
             "decoder_input_ids": decoder_input_ids,

From a6a592d2b1127696100434e5397689710682d90d Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 15:21:48 -0400
Subject: [PATCH 24/68] no output_past

---
 src/transformers/bart_mem_prof.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index 81158831b239..dfdd22b47817 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -5,12 +5,13 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
 
     tokenizer = BartTokenizer.from_pretrained('bart-large')
     lns = [" " + x.rstrip() for x in open(source_path).readlines()][:batch_size]
-    model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
+
     dct = tokenizer.batch_encode_plus(lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
     ids = dct['input_ids'].to(DEFAULT_DEVICE)
     msk = dct['attention_mask'].to(DEFAULT_DEVICE)
     model.log_mem('starting')
     if prof_generate:
+        model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
         summaries = model.generate(
             input_ids=ids,
             attention_mask=msk,
@@ -26,6 +27,7 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
         model.log_mem('done')
         dec = [tokenizer.decode(s) for s in summaries]
     else:
+        model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
         #model.decoder.generation_mode = False
         model(
             input_ids=ids,

From 681e0a3581dfab1682dbea24538e5b26cacdaacc Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 15:23:11 -0400
Subject: [PATCH 25/68] fix

---
 src/transformers/bart_mem_prof.py | 3 +--
 src/transformers/modeling_bart.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index dfdd22b47817..9e3774b6d7dc 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -9,9 +9,9 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
     dct = tokenizer.batch_encode_plus(lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
     ids = dct['input_ids'].to(DEFAULT_DEVICE)
     msk = dct['attention_mask'].to(DEFAULT_DEVICE)
+    model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=prof_generate).to(DEFAULT_DEVICE)
     model.log_mem('starting')
     if prof_generate:
-        model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
         summaries = model.generate(
             input_ids=ids,
             attention_mask=msk,
@@ -27,7 +27,6 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
         model.log_mem('done')
         dec = [tokenizer.decode(s) for s in summaries]
     else:
-        model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=True).to(DEFAULT_DEVICE)
         #model.decoder.generation_mode = False
         model(
             input_ids=ids,
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index f04b56fa3ea0..08989874955b 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -992,9 +992,9 @@ def prepare_inputs_for_generation(self, decoder_input_ids, past, encoder_inputs,
         else:
             encoder_outputs, decoder_cached_states = past
 
-        input_ids = encoder_inputs # FIXME(SS): dont need this
+        input_ids = encoder_inputs
         return {
-            "input_ids": encoder_inputs,  # ignored after first pass
+            "input_ids": input_ids,  # ignored after first pass
             "encoder_outputs": encoder_outputs,
             "decoder_cached_states": decoder_cached_states,
             "decoder_input_ids": decoder_input_ids,

From 8b46f83aeb55d43db353755a510dfd5b28c6ad7c Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 15:26:10 -0400
Subject: [PATCH 26/68] no grad

---
 src/transformers/bart_mem_prof.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index 9e3774b6d7dc..ddf0f1077849 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -27,11 +27,12 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
         model.log_mem('done')
         dec = [tokenizer.decode(s) for s in summaries]
     else:
-        #model.decoder.generation_mode = False
-        model(
-            input_ids=ids,
-            attention_mask=msk,
-        )
+        #model.decoder.generation_mode = Fals
+        with torch.no_grad():
+            model(
+                input_ids=ids,
+                attention_mask=msk,
+            )
 
     log_df = model.combine_logs()
     log_df.to_csv(out_file)

From 7d669b8797e7bc16377c92dfefcd6f461cca7f4c Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 15:59:57 -0400
Subject: [PATCH 27/68] no lm_head

---
 src/transformers/bart_mem_prof.py | 3 ++-
 src/transformers/modeling_bart.py | 8 ++++----
 tests/test_modeling_bart.py       | 6 +++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index ddf0f1077849..5ef9609d4da0 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -26,6 +26,7 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
         )
         model.log_mem('done')
         dec = [tokenizer.decode(s) for s in summaries]
+        print(dec[0])
     else:
         #model.decoder.generation_mode = Fals
         with torch.no_grad():
@@ -36,7 +37,7 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
 
     log_df = model.combine_logs()
     log_df.to_csv(out_file)
-    print(dec[0])
+
 
 import argparse
 if __name__ == '__main__':
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 08989874955b..9641acfa54ef 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -904,7 +904,7 @@ def __init__(self, config: BartConfig):
         # if base_model is Nones:
         #self.log_mem('pre-init')
         self.model = BartModel(config)
-        self.lm_head = _make_linear_from_emb(self.model.shared)
+        #self.lm_head = _make_linear_from_emb(self.model.shared)
 
     def tie_weights(self):
         pass  # hack to prevent changing lm_head.out_features. The input and output embeddings are still the same.
@@ -972,7 +972,8 @@ def forward(
             decoder_cached_states=decoder_cached_states,
         )
         self.model.log_mem('after call, before lm_head')
-        lm_logits = self.lm_head(outputs[0])
+        lm_logits = F.linear(outputs[0], self.model.shared.weight)
+        #lm_logits = self.lm_head(outputs[0])
         self.model.log_mem('after lm_head')
         outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
@@ -1025,9 +1026,8 @@ def _reorder_cache(past, beam_idx):
 
         past = ((new_enc_out, new_enc_mask), reordered_past)
         return past
-
     def get_output_embeddings(self):
-        return self.lm_head
+        return _make_linear_from_emb(self.shared)  # make it on the fly
 
 
 @add_start_docstrings(
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 3a8599720df6..1bd2bd121587 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -261,9 +261,13 @@ def test_lm_uneven_forward(self):
             max_position_embeddings=48,
         )
         lm_model = BartForConditionalGeneration(config).to(torch_device)
+        lm_model.log_mem('starting')
         context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
         summary = torch.Tensor([[82, 71, 82, 18, 2], [58, 68, 2, 1, 1]]).long().to(torch_device)
-        loss, logits, enc_features = lm_model.forward(input_ids=context, decoder_input_ids=summary, lm_labels=summary)
+        loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, lm_labels=summary)
+        log_df = lm_model.combine_logs()
+        tot = log_df.cpu_mem.max()-log_df.cpu_mem.min()
+        self.assertGreaterEqual(tot/1024**2, 3, )
         expected_shape = (*summary.shape, config.vocab_size)
         self.assertEqual(logits.shape, expected_shape)
 

From 237202f3920c89ea4a1515366a35414a4181ab1a Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 16:03:41 -0400
Subject: [PATCH 28/68] get it off gpu

---
 src/transformers/bart_mem_prof.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index 5ef9609d4da0..a84c0f360f81 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -11,7 +11,9 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
     msk = dct['attention_mask'].to(DEFAULT_DEVICE)
     model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=prof_generate).to(DEFAULT_DEVICE)
     model.log_mem('starting')
+    model.lm_head.cpu()
     if prof_generate:
+
         summaries = model.generate(
             input_ids=ids,
             attention_mask=msk,

From 3bb13cf72b0d7eef913f377677043283555529ce Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Sun, 15 Mar 2020 16:06:25 -0400
Subject: [PATCH 29/68] Remove model.lm_head.cpu() call from bart_mem_prof runner

---
 src/transformers/bart_mem_prof.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/bart_mem_prof.py b/src/transformers/bart_mem_prof.py
index a84c0f360f81..f3523f05d770 100644
--- a/src/transformers/bart_mem_prof.py
+++ b/src/transformers/bart_mem_prof.py
@@ -11,7 +11,6 @@ def runner(source_path, out_file, batch_size=8, device=DEFAULT_DEVICE, prof_gene
     msk = dct['attention_mask'].to(DEFAULT_DEVICE)
     model = BartForConditionalGeneration.from_pretrained('bart-large-cnn', output_past=prof_generate).to(DEFAULT_DEVICE)
     model.log_mem('starting')
-    model.lm_head.cpu()
     if prof_generate:
 
         summaries = model.generate(

From 8381c9b9b0499c5f134e66dc47bb87604520f107 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 18:37:47 -0400
Subject: [PATCH 30/68] Use {i} in encoder log_mem label; iterate items() in reorder_attn_buffer

---
 src/transformers/modeling_bart.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 9641acfa54ef..0b4967626494 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -315,7 +315,7 @@ def forward(
                 attn = None
             else:
                 x, attn = encoder_layer(x, attention_mask)
-            self.log_mem('encoder: called layer {}')
+            self.log_mem('encoder: called layer {i}')
 
             if self.output_attentions:
                 all_attentions.append(attn)
@@ -537,9 +537,7 @@ def forward(
 
 def reorder_attn_buffer(input_buffer, new_order):
     """Reorder buffered internal state (for incremental generation)."""
-    # input_buffer = self._get_input_buffer(incremental_state)
-    for k in input_buffer.keys():
-        input_buffer_k = input_buffer[k]
+    for k, input_buffer_k in input_buffer.items():
         if input_buffer_k is not None:
             input_buffer[k] = input_buffer_k.index_select(0, new_order)
         # incremental_state = self._set_input_buffer(incremental_state, input_buffer)

From 1c5fe4e4186d7f8f7773f316ff87e1e09def1486 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 22:56:32 -0400
Subject: [PATCH 31/68] new padding strategy

---
 src/transformers/modeling_bart.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 0b4967626494..35b5adb58d0b 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -678,8 +678,9 @@ def forward(
 
         if key_padding_mask is not None:  # don't attend to padding symbols
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
-            attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
+            attn_weights = attn_weights + key_padding_mask.unsqueeze(1).unsqueeze(2)
+            #reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)#.to(torch.bool)
+            #attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = F.softmax(attn_weights, dim=-1)
         attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)

From c9d5c6363546e00a976d72f74f650cc231acee60 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:04:10 -0400
Subject: [PATCH 32/68] Fix get_output_embeddings to use self.model.shared

---
 src/transformers/modeling_bart.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 35b5adb58d0b..069bc4582e0b 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -676,7 +676,7 @@ def forward(
             key_padding_mask = None
         assert key_padding_mask is None or key_padding_mask.size()[:2] == (bsz, src_len,)
 
-        if key_padding_mask is not None:  # don't attend to padding symbols
+        if key_padding_mask is not None:  # shape (bsz, src_len)
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             attn_weights = attn_weights + key_padding_mask.unsqueeze(1).unsqueeze(2)
             #reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)#.to(torch.bool)
@@ -1025,8 +1025,9 @@ def _reorder_cache(past, beam_idx):
 
         past = ((new_enc_out, new_enc_mask), reordered_past)
         return past
+
     def get_output_embeddings(self):
-        return _make_linear_from_emb(self.shared)  # make it on the fly
+        return _make_linear_from_emb(self.model.shared)  # make it on the fly
 
 
 @add_start_docstrings(

From dffc461ed0ce397f9956ecc9a98b5b6ed010f143 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:19:51 -0400
Subject: [PATCH 33/68] Revert additive key_padding_mask; restore masked_fill with -inf

---
 src/transformers/modeling_bart.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 069bc4582e0b..d654c3854d37 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -678,9 +678,8 @@ def forward(
 
         if key_padding_mask is not None:  # shape (bsz, src_len)
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights + key_padding_mask.unsqueeze(1).unsqueeze(2)
-            #reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)#.to(torch.bool)
-            #attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
+            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
+            attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = F.softmax(attn_weights, dim=-1)
         attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)

From 9590f160ef75dbd57705b77ab5664721e2630d36 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:23:35 -0400
Subject: [PATCH 34/68] Invert encoder attention_mask to a long tensor without -10000 scaling

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index d654c3854d37..fcf03ea1efd9 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -291,7 +291,7 @@ def forward(
         if attention_mask is not None:
             assert attention_mask.dim() == 2
 
-            attention_mask = (1.0 - attention_mask.long()) * -10000.0
+            attention_mask = (1.0 - attention_mask.long()).long()
             assert attention_mask.max() <= 0
         inputs_embeds = self.embed_tokens(input_ids)
         embed_pos = self.embed_positions(input_ids)

From bfaae34c11530ff2d3b77f96f33a3b37e21272d2 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:26:02 -0400
Subject: [PATCH 35/68] del trace

---
 tests/test_modeling_bart.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 1bd2bd121587..87af8f8ab64b 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -326,10 +326,10 @@ def test_generate_fp16(self):
         config, input_ids, batch_size = self._get_config_and_data(output_past=True)
         attention_mask = input_ids.ne(1).to(torch_device)
         model = BartForConditionalGeneration(config).eval().to(torch_device).half()
-        trace = start_memory_tracing(modules_to_trace="transformers")
+        #trace = start_memory_tracing(modules_to_trace="transformers")
         model.generate(input_ids, attention_mask=attention_mask, do_sample=False, early_stopping=True)
-        summary = MemoryViewer(stop_memory_tracing(trace))
-        summary.save_line_by_line('hf_mem_half_gen.txt')
+        #summary = MemoryViewer(stop_memory_tracing(trace))
+        #summary.save_line_by_line('hf_mem_half_gen.txt')
 
 
 

From c61f3e4bde2470a40dddfe70a789deab3cb67af8 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:26:27 -0400
Subject: [PATCH 36/68] del trace

---
 tests/test_modeling_bart.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 87af8f8ab64b..adc6fc869317 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -419,12 +419,12 @@ def test_mnli_inference(self):
         model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli").to(torch_device)
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         # Test that model hasn't changed
-        trace = start_memory_tracing(modules_to_trace="transformers")
+        #trace = start_memory_tracing(modules_to_trace="transformers")
 
         with torch.no_grad():
             batched_logits, features = model.forward(**inputs_dict)
-        summary = MemoryViewer(stop_memory_tracing(trace))
-        summary.save_line_by_line('hf_mem.txt')
+        #summary = MemoryViewer(stop_memory_tracing(trace))
+        #summary.save_line_by_line('hf_mem.txt')
         expected_shape = torch.Size((2, 3))
         self.assertEqual(batched_logits.shape, expected_shape)
         expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device)

From 0274d58b1eb28a1476f5160635bbf1c873e7a9c1 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Tue, 17 Mar 2020 23:28:00 -0400
Subject: [PATCH 37/68] Fix mask

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index fcf03ea1efd9..3a226669da24 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -292,7 +292,7 @@ def forward(
             assert attention_mask.dim() == 2
 
             attention_mask = (1.0 - attention_mask.long()).long()
-            assert attention_mask.max() <= 0
+            #assert attention_mask.max() <= 0
         inputs_embeds = self.embed_tokens(input_ids)
         embed_pos = self.embed_positions(input_ids)
         x = inputs_embeds + embed_pos

From 7629d42432ae9b62bd4ffdc7fbe7554a0c05154c Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Wed, 18 Mar 2020 00:05:28 -0400
Subject: [PATCH 38/68] Use integer 1 when inverting encoder attention_mask

---
 src/transformers/modeling_bart.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 3a226669da24..8458c82d8021 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -290,8 +290,7 @@ def forward(
         # check attention mask and invert
         if attention_mask is not None:
             assert attention_mask.dim() == 2
-
-            attention_mask = (1.0 - attention_mask.long()).long()
+            attention_mask = (1 - attention_mask.long()).long()
             #assert attention_mask.max() <= 0
         inputs_embeds = self.embed_tokens(input_ids)
         embed_pos = self.embed_positions(input_ids)

From 258909925034493b189177aa2bf2455963a584e9 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 09:45:04 -0400
Subject: [PATCH 39/68] bart mem utests

---
 tests/test_modeling_bart.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 51954d586b37..7a6eeff32cdd 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -396,7 +396,25 @@ def _long_tensor(tok_lst):
 
 
 TOLERANCE = 1e-4
-
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+@require_torch
+class MemoryTests(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        source_path = "test.source"
+        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:8]
+        tokenizer = BartTokenizer.from_pretrained('bart-large')
+        dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
+        cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
+
+
+    def test_base_model_mem(self):
+        model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
+        model.reset_logs()
+        import ipdb; ipdb; ipdb.set_trace()
+        log_df = model.combine_logs()
+        log_df.to_csv('hf_batch_fwd_logs.csv')
 
 @require_torch
 class BartModelIntegrationTests(unittest.TestCase):

From ae6a7c6b2dc5e3dceac96f7253b99074d61e7e16 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 09:47:55 -0400
Subject: [PATCH 40/68] callfwd

---
 tests/test_modeling_bart.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 7a6eeff32cdd..861a8350cbee 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -411,8 +411,9 @@ def setUpClass(cls):
 
     def test_base_model_mem(self):
         model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
+        model.log_mem('after init', verbose=True)
         model.reset_logs()
-        import ipdb; ipdb; ipdb.set_trace()
+        model(self.ids)
         log_df = model.combine_logs()
         log_df.to_csv('hf_batch_fwd_logs.csv')
 

From 01218ace26a04becc9df3442650fc10b4f4c9824 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 12:19:50 -0400
Subject: [PATCH 41/68] new test file

---
 tests/test_bart_memory.py   | 28 ++++++++++++++++++++++++++++
 tests/test_modeling_bart.py | 24 +-----------------------
 2 files changed, 29 insertions(+), 23 deletions(-)
 create mode 100644 tests/test_bart_memory.py

diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
new file mode 100644
index 000000000000..44d84e56b395
--- /dev/null
+++ b/tests/test_bart_memory.py
@@ -0,0 +1,28 @@
+import unittest
+
+import torch
+
+from tests.utils import require_torch, slow
+from transformers import BartTokenizer, BartModel
+
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+@require_torch
+class MemoryTests(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        source_path = "test.source"
+        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:8]
+        tokenizer = BartTokenizer.from_pretrained('bart-large')
+        dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
+        cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
+
+    def test_base_model_mem(self):
+        model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
+        model.log_mem('after init', verbose=True)
+        model.reset_logs()
+        model(self.ids)
+        log_df = model.combine_logs()
+        log_df.to_csv('hf_batch_fwd_logs.csv')
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index d3781ad5c522..a0fd63d35585 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -41,8 +41,6 @@
         LARGE_NEGATIVE,
     )
     from transformers.tokenization_bart import BartTokenizer
-    from transformers import start_memory_tracing, stop_memory_tracing
-    from transformers.file_utils import MemoryViewer
 
 
 @require_torch
@@ -267,7 +265,6 @@ def test_lm_uneven_forward(self):
         loss, logits, enc_features = lm_model(input_ids=context, decoder_input_ids=summary, lm_labels=summary)
         log_df = lm_model.combine_logs()
         tot = log_df.cpu_mem.max()-log_df.cpu_mem.min()
-        self.assertGreaterEqual(tot/1024**2, 3, )
 
     def test_generate_beam_search(self):
         input_ids = torch.Tensor([[71, 82, 2], [68, 34, 2]]).long().to(torch_device)
@@ -407,26 +404,7 @@ def _long_tensor(tok_lst):
 
 
 TOLERANCE = 1e-4
-DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-@require_torch
-class MemoryTests(unittest.TestCase):
-
-    @classmethod
-    def setUpClass(cls):
-        source_path = "test.source"
-        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:8]
-        tokenizer = BartTokenizer.from_pretrained('bart-large')
-        dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
-        cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
-
-
-    def test_base_model_mem(self):
-        model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
-        model.log_mem('after init', verbose=True)
-        model.reset_logs()
-        model(self.ids)
-        log_df = model.combine_logs()
-        log_df.to_csv('hf_batch_fwd_logs.csv')
+
 
 @require_torch
 class BartModelIntegrationTests(unittest.TestCase):

From c8cca90313341b9ece60968b344f2c94b794f48b Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 12:26:26 -0400
Subject: [PATCH 42/68] no attn_weights

---
 src/transformers/modeling_bart.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 375286b5fc51..7a87b8a59f45 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -614,8 +614,8 @@ def forward(
         assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
         attn_output = self.out_proj(attn_output)
-        attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-        return attn_output, attn_weights
+        #attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+        return attn_output, ()
 
     def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
         # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)

From f4bc62aa18c8ebef5a010e8e16f3d50806c414d9 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 12:36:26 -0400
Subject: [PATCH 43/68] Return attn_weights from attention again; make encoder log_mem verbose

---
 src/transformers/modeling_bart.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 7a87b8a59f45..18d4a74ee41f 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -293,6 +293,7 @@ def forward(
 
         encoder_states, all_attentions = [], []
         for i, encoder_layer in enumerate(self.layers):
+
             if self.output_hidden_states:
                 encoder_states.append(x)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
@@ -301,7 +302,7 @@ def forward(
                 attn = None
             else:
                 x, attn = encoder_layer(x, attention_mask)
-            self.log_mem('encoder: called layer {i}')
+            self.log_mem('encoder: called layer {i}', verbose=True)
 
             if self.output_attentions:
                 all_attentions.append(attn)
@@ -615,7 +616,7 @@ def forward(
         attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
         attn_output = self.out_proj(attn_output)
         #attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-        return attn_output, ()
+        return attn_output, attn_weights
 
     def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
         # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)

From 255ebe163e6e909c7f05e38b62140566758a0fb5 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 12:45:43 -0400
Subject: [PATCH 44/68] Delay mem

---
 src/transformers/modeling_bart.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 18d4a74ee41f..e81f2f69213e 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -185,6 +185,7 @@ def make_padding_mask(input_ids, padding_idx=1):
 
 
 # Helper Modules
+from durbango.torch_utils import get_shapes
 
 
 class EncoderLayer(nn.Module):
@@ -779,6 +780,9 @@ def forward(
     ):
 
         # make masks if user doesn't supply
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        assert isinstance(encoder_outputs, tuple)
         if not generation_mode:
             decoder_input_ids, decoder_attention_mask = _prepare_bart_decoder_inputs(
                 self.config,
@@ -788,9 +792,6 @@ def forward(
                 mask_dtype=self.shared.weight.dtype,
             )
         assert decoder_input_ids is not None
-        if encoder_outputs is None:
-            encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
-        assert isinstance(encoder_outputs, tuple)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
             decoder_input_ids,

From 600d62ae1e0ca051524707b7c3b07a42820575ca Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 13:12:27 -0400
Subject: [PATCH 45/68] Memory test: use 6 source lines and print model.summary

---
 tests/test_bart_memory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
index 44d84e56b395..8365136423ec 100644
--- a/tests/test_bart_memory.py
+++ b/tests/test_bart_memory.py
@@ -14,7 +14,7 @@ class MemoryTests(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         source_path = "test.source"
-        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:8]
+        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:6]
         tokenizer = BartTokenizer.from_pretrained('bart-large')
         dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
         cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
@@ -26,3 +26,4 @@ def test_base_model_mem(self):
         model(self.ids)
         log_df = model.combine_logs()
         log_df.to_csv('hf_batch_fwd_logs.csv')
+        print(model.summary)

From 6ff2eb53862026fb7f5d694e84a44d1517a6a241 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 13:17:42 -0400
Subject: [PATCH 46/68] Save encoder logs to hf_fwd_logs.csv after each layer

---
 src/transformers/modeling_bart.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index e81f2f69213e..994780dea4c4 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -304,6 +304,7 @@ def forward(
             else:
                 x, attn = encoder_layer(x, attention_mask)
             self.log_mem('encoder: called layer {i}', verbose=True)
+            self.save_log_csv('hf_fwd_logs.csv')
 
             if self.output_attentions:
                 all_attentions.append(attn)

From 4199a7ba01e46124c529748050ae243336cd2874 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 14:57:07 -0400
Subject: [PATCH 47/68] Add LoggingMixin to EncoderLayer bases

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 994780dea4c4..4ead5eb89009 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -188,7 +188,7 @@ def make_padding_mask(input_ids, padding_idx=1):
 from durbango.torch_utils import get_shapes
 
 
-class EncoderLayer(nn.Module):
+class EncoderLayer(nn.Module, LoggingMixin):
     def __init__(self, config: BartConfig):
         super().__init__()
         self.embed_dim = config.d_model

From 8219f5c515f01fe6db8de059f2ec810071e80e61 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:08:04 -0400
Subject: [PATCH 48/68] Encoder: log before layer loop, assert on output flags, use save_logs

---
 src/transformers/modeling_bart.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 4ead5eb89009..bd1d4060470b 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -26,6 +26,7 @@
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, collect_log_data, bytes_to_human_readable
 from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids
 from durbango.logging_utils import LoggingMixin
+from durbango.torch_utils import print_tensor_sizes, local_sizeof, get_tensor_shapes_and_pointers
 
 logger = logging.getLogger(__name__)
 
@@ -284,14 +285,14 @@ def forward(
             attention_mask = attention_mask.eq(0)
 
         inputs_embeds = self.embed_tokens(input_ids)
-        embed_pos = self.embed_positions(input_ids)
-        x = inputs_embeds + embed_pos
+        x = inputs_embeds + self.embed_positions(input_ids)
         x = self.layernorm_embedding(x)
         x = F.dropout(x, p=self.dropout, training=self.training)
+        assert not self.output_attentions or self.output_hidden_states
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)
-
+        self.log_mem('encoder: starting_loop')
         encoder_states, all_attentions = [], []
         for i, encoder_layer in enumerate(self.layers):
 
@@ -304,7 +305,7 @@ def forward(
             else:
                 x, attn = encoder_layer(x, attention_mask)
             self.log_mem('encoder: called layer {i}', verbose=True)
-            self.save_log_csv('hf_fwd_logs.csv')
+            self.save_logs('hf_fwd_logs.txt')
 
             if self.output_attentions:
                 all_attentions.append(attn)

From 74dcbb22a2cc36c0f9efe34009054b4f0b9499f7 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:11:16 -0400
Subject: [PATCH 49/68] Fix precedence in encoder output_attentions/output_hidden_states assert

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index bd1d4060470b..da66146ffe62 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -288,7 +288,7 @@ def forward(
         x = inputs_embeds + self.embed_positions(input_ids)
         x = self.layernorm_embedding(x)
         x = F.dropout(x, p=self.dropout, training=self.training)
-        assert not self.output_attentions or self.output_hidden_states
+        assert not (self.output_attentions or self.output_hidden_states)
 
         # B x T x C -> T x B x C
         x = x.transpose(0, 1)

From 685d89255aa689943e21419efdb6a4756af26d7b Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:13:19 -0400
Subject: [PATCH 50/68] boom boom

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index da66146ffe62..96fd0cd92415 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -304,7 +304,7 @@ def forward(
                 attn = None
             else:
                 x, attn = encoder_layer(x, attention_mask)
-            self.log_mem('encoder: called layer {i}', verbose=True)
+            self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 
             if self.output_attentions:

From 8fd4be3b8811e82570205177549e553821a80927 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:22:01 -0400
Subject: [PATCH 51/68] undo thom chg

---
 src/transformers/__init__.py      |   3 -
 src/transformers/file_utils.py    | 326 ------------------------------
 src/transformers/modeling_bart.py |   4 +-
 3 files changed, 3 insertions(+), 330 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index bd9f10f5f8b5..a41ae430dca7 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -83,12 +83,9 @@
     WEIGHTS_NAME,
     add_end_docstrings,
     add_start_docstrings,
-    bytes_to_human_readable,
     cached_path,
     is_tf_available,
     is_torch_available,
-    start_memory_tracing,
-    stop_memory_tracing,
 )
 
 # Model Cards
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index cf9de383eb94..dfc6d1a8feff 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -6,14 +6,12 @@
 
 import fnmatch
 import json
-import linecache
 import logging
 import os
 import shutil
 import sys
 import tarfile
 import tempfile
-from collections import defaultdict, namedtuple
 from contextlib import contextmanager
 from functools import partial, wraps
 from hashlib import sha256
@@ -30,7 +28,6 @@
 
 from . import __version__
 
-from durbango.logging_utils import LoggingMixin
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
@@ -499,326 +496,3 @@ def _resumable_file_manager():
             json.dump(meta, meta_file)
 
     return cache_path
-
-
-_memory_tracing_enabled = False
-Frame = namedtuple("Frame", ["filename", "module", "line_number", "event", "line_text"])
-UsedMemoryState = namedtuple("UsedMemoryState", ["frame", "cpu_memory", "gpu_memory"])
-
-
-def start_memory_tracing(modules_to_trace=None, modules_not_to_trace=None, events_to_trace="line", gpus_to_trace=None):
-    """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
-        See `../../examples/benchmarks.py for a usage example.
-        Current memory consumption is returned using psutil and in particular is the RSS memory
-            "Resident Set Size” (the non-swapped physical memory the process is using).
-            See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
-
-        Args:
-            - `modules_to_trace`: (None, string, list/tuple of string)
-                if None, all events are recorded
-                if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
-            - `modules_not_to_trace`: (None, string, list/tuple of string)
-                if None, no module is avoided
-                if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
-            - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
-                default to line
-            - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
-
-        Return:
-            - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
-                - `UsedMemoryState` are named tuples with the following fields:
-                    - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
-                    - 'cpu_memory': CPU RSS memory state *before* executing the line
-                    - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
-
-        `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
-            `Frame` has the following fields:
-            - 'filename' (string): Name of the file currently executed
-            - 'module' (string): Name of the module currently executed
-            - 'line_number' (int): Number of the line currently executed
-            - 'event' (string): Event that triggered the tracing (default will be "line")
-            - 'line_text' (string): Text of the line in the python script
-
-    """
-    try:
-        import psutil
-    except (ImportError):
-        logger.warning(
-            "Psutil not installed, we won't log CPU memory usage. "
-            "Install psutil (pip install psutil) to use CPU memory tracing."
-        )
-        process = None
-    else:
-        process = psutil.Process(os.getpid())
-
-    try:
-        from py3nvml import py3nvml
-
-        py3nvml.nvmlInit()
-        devices = list(range(py3nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
-        py3nvml.nvmlShutdown()
-    except ImportError:
-        logger.warning(
-            "py3nvml not installed, we won't log GPU memory usage. "
-            "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
-        )
-        log_gpu = False
-    except (OSError, py3nvml.NVMLError):
-        logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
-        log_gpu = False
-    else:
-        log_gpu = _torch_available or _tf_available
-
-    memory_trace = []
-
-    def traceit(frame, event, args):
-        """ Tracing method executed before running each line in a module or sub-module
-            Record memory allocated in a list with debugging information
-        """
-        global _memory_tracing_enabled
-
-        if not _memory_tracing_enabled:
-            return traceit
-
-        # Filter events
-        if events_to_trace is not None:
-            if isinstance(events_to_trace, str) and event != events_to_trace:
-                return traceit
-            elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
-                return traceit
-
-        # Filter modules
-        name = frame.f_globals["__name__"]
-        if not isinstance(name, str):
-            return traceit
-        else:
-            # Filter whitelist of modules to trace
-            if modules_to_trace is not None:
-                if isinstance(modules_to_trace, str) and modules_to_trace not in name:
-                    return traceit
-                elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
-                    return traceit
-
-            # Filter blacklist of modules not to trace
-            if modules_not_to_trace is not None:
-                if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
-                    return traceit
-                elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
-                    return traceit
-
-        # Record current tracing state (file, location in file...)
-        lineno = frame.f_lineno
-        filename = frame.f_globals["__file__"]
-        if filename.endswith(".pyc") or filename.endswith(".pyo"):
-            filename = filename[:-1]
-        line = linecache.getline(filename, lineno).rstrip()
-        traced_state = Frame(filename, name, lineno, event, line)
-
-        # Record current memory state (rss memory) and compute difference with previous memory state
-        cpu_mem = 0
-        if process is not None:
-            mem = process.memory_info()
-            cpu_mem = mem.rss
-
-        gpu_mem = 0
-        if log_gpu:
-            # Clear GPU caches
-            if _torch_available:
-                torch.cuda.empty_cache()
-            if _tf_available:
-                from tensorflow.python.eager import context
-
-                context.context()._clear_caches()  # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
-
-            # Sum used memory for all GPUs
-            py3nvml.nvmlInit()
-            for i in devices:
-                handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
-                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
-                gpu_mem += meminfo.used
-            py3nvml.nvmlShutdown()
-
-        mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
-        memory_trace.append(mem_state)
-
-        return traceit
-
-    sys.settrace(traceit)
-
-    global _memory_tracing_enabled
-    _memory_tracing_enabled = True
-
-    return memory_trace
-
-
-from py3nvml import py3nvml
-
-
-def run_gpu_mem_counter():
-    # Sum used memory for all GPUs
-    if not torch.cuda.is_available(): return 0
-    py3nvml.nvmlInit()
-    devices = list(range(py3nvml.nvmlDeviceGetCount())) #if gpus_to_trace is None else gpus_to_trace
-    gpu_mem = 0
-    for i in devices:
-        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
-        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
-        gpu_mem += meminfo.used
-    py3nvml.nvmlShutdown()
-    return gpu_mem
-
-
-Memory = namedtuple("Memory", ["bytes", "string"])
-MemoryState = namedtuple("MemoryState", ["frame", "cpu", "gpu", "cpu_gpu"])
-MemorySummary = namedtuple("MemorySummary", ["sequential", "cumulative", "total"])
-
-
-def stop_memory_tracing(memory_trace=None, ignore_released_memory=True):
-    """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
-
-        Args:
-            - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
-            - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total memory
-
-        Return:
-            - None if `memory_trace` is None
-            - `MemorySummary` namedtuple otherwise with the fields:
-                - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace`
-                    by substracting the memory after executing each line from the memory before executing said line.
-                - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
-                    obtained by summing repeted memory increase for a line if it's executed several times.
-                    The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released)
-                - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
-                    Line with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
-
-        `Memory` named tuple have fields
-            - `byte` (integer): number of bytes,
-            - `string` (string): same as human readable string (ex: "3.5MB")
-
-        `Frame` are namedtuple used to list the current frame state and have the following fields:
-            - 'filename' (string): Name of the file currently executed
-            - 'module' (string): Name of the module currently executed
-            - 'line_number' (int): Number of the line currently executed
-            - 'event' (string): Event that triggered the tracing (default will be "line")
-            - 'line_text' (string): Text of the line in the python script
-
-        `MemoryState` are namedtuples listing frame + CPU/GPU memory with the following fields:
-            - `frame` (`Frame`): the current frame (see above)
-            - `cpu`: CPU memory consumed at during the current frame as a `Memory` named tuple
-            - `gpu`: GPU memory consumed at during the current frame as a `Memory` named tuple
-            - `cpu_gpu`: CPU + GPU memory consumed at during the current frame as a `Memory` named tuple
-    """
-    global _memory_tracing_enabled
-    _memory_tracing_enabled = False
-
-    if memory_trace is not None and len(memory_trace) > 1:
-        memory_diff_trace = []
-        cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
-        for (frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem) in zip(
-            memory_trace[:-1], memory_trace[1:]
-        ):
-            cpu_mem_inc = next_cpu_mem - cpu_mem
-            cpu_mem_str = bytes_to_human_readable(cpu_mem_inc)
-            gpu_mem_inc = next_gpu_mem - gpu_mem
-            gpu_mem_str = bytes_to_human_readable(gpu_mem_inc)
-            cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
-            cpu_gpu_mem_str = bytes_to_human_readable(cpu_gpu_mem_inc)
-            memory_diff_trace.append(
-                MemoryState(
-                    frame=frame,
-                    cpu=Memory(cpu_mem_inc, cpu_mem_str),
-                    gpu=Memory(gpu_mem_inc, gpu_mem_str),
-                    cpu_gpu=Memory(cpu_gpu_mem_inc, cpu_gpu_mem_str),
-                )
-            )
-            cumulative_memory_dict[frame][0] += cpu_mem_inc
-            cumulative_memory_dict[frame][1] += gpu_mem_inc
-            cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
-
-        cumulative_memory = sorted(
-            list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
-        )  # order by the total CPU + GPU memory increase
-        cumulative_memory = list(
-            MemoryState(
-                frame=frame,
-                cpu=Memory(cpu_mem_inc, bytes_to_human_readable(cpu_mem_inc)),
-                gpu=Memory(gpu_mem_inc, bytes_to_human_readable(gpu_mem_inc)),
-                cpu_gpu=Memory(cpu_gpu_mem_inc, bytes_to_human_readable(cpu_gpu_mem_inc)),
-            )
-            for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
-        )
-
-        if ignore_released_memory:
-            total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
-        else:
-            total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
-        total_memory = Memory(bytes=total_memory, string=bytes_to_human_readable(total_memory))
-        return MemorySummary(sequential=memory_diff_trace, cumulative=cumulative_memory, total=total_memory)
-
-    return None
-
-
-def bytes_to_human_readable(memory_amount):
-    """ Utility to convert a number of bytes (int) in a human readable string (with units)
-    """
-    for unit in ["B", "KB", "MB", "GB"]:
-        if memory_amount > -1024.0 and memory_amount < 1024.0:
-            return "{:.3f}{}".format(memory_amount, unit)
-        memory_amount /= 1024.0
-    return "{:.3f}TB".format(memory_amount)
-import psutil
-import time
-
-def collect_log_data(msg=''):
-    process = psutil.Process(os.getpid())
-    cpu_mem = process.memory_info().rss
-    gpu_mem = run_gpu_mem_counter()
-    record = dict(cpu_mem=cpu_mem, gpu_mem=gpu_mem,
-         time = time.time(),
-         msg=msg)
-    long_msg = f'{msg}: GPU: {bytes_to_human_readable(gpu_mem)} CPU: {bytes_to_human_readable(gpu_mem)}'
-    record['long_msg'] = long_msg
-    print(long_msg)
-    return record
-
-class MemoryViewer:
-
-    def __init__(self, summary):
-        self.summary = summary
-    @property
-    def total(self):
-        print(f"\nTotal memory increase: {self.summary.total.string}")
-
-    @property
-    def line_by_line(self):
-        return (
-            "\nLines by line memory consumption:\n"
-            + "\n".join(
-                f"{frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
-                for frame, _, _, cpu_gpu_mem in self.summary.sequential
-            )
-        )
-
-    def save_line_by_line(self, path):
-        res = self.line_by_line
-        with open(path, 'w') as f:
-            f.write(res)
-
-    def top_lines(self, n=6):
-        print(
-            "\nLines with top memory consumption:\n"
-            + "\n".join(
-                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
-                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[:n]
-            )
-        )
-
-
-    def bottom_lines(self, n=6):
-        print(
-            "\nLines with top memory consumption:\n"
-            + "\n".join(
-                f"=> {frame.filename}:{frame.line_number}: mem {cpu_gpu_mem.string}: {frame.line_text}"
-                for frame, _, _, cpu_gpu_mem in self.summary.cumulative[-n:]
-            )
-        )
diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 96fd0cd92415..d3df43c37400 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -23,7 +23,7 @@
 
 from .activations import ACT2FN
 from .configuration_bart import BartConfig
-from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, collect_log_data, bytes_to_human_readable
+from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, create_position_ids_from_input_ids
 from durbango.logging_utils import LoggingMixin
 from durbango.torch_utils import print_tensor_sizes, local_sizeof, get_tensor_shapes_and_pointers
@@ -306,6 +306,8 @@ def forward(
                 x, attn = encoder_layer(x, attention_mask)
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
+            if i > 10:
+                rdd = print_tensor_sizes()
 
             if self.output_attentions:
                 all_attentions.append(attn)

From 12cf809beb8dd0acc3b44d19455759b1dc459997 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:24:02 -0400
Subject: [PATCH 52/68] boom boom

---
 src/transformers/modeling_bart.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index d3df43c37400..3102cf90713c 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -294,6 +294,7 @@ def forward(
         x = x.transpose(0, 1)
         self.log_mem('encoder: starting_loop')
         encoder_states, all_attentions = [], []
+        rdd_start = print_tensor_sizes()
         for i, encoder_layer in enumerate(self.layers):
 
             if self.output_hidden_states:
@@ -308,6 +309,7 @@ def forward(
             self.save_logs('hf_fwd_logs.txt')
             if i > 10:
                 rdd = print_tensor_sizes()
+                rdd.to_csv(f'rdd_step_{i}.csv')
 
             if self.output_attentions:
                 all_attentions.append(attn)

From ed1c07ff1d42b6c21af3f71d2190acade8031f10 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:27:42 -0400
Subject: [PATCH 53/68] boom boom

---
 src/transformers/modeling_bart.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 3102cf90713c..dc616273d897 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -294,7 +294,8 @@ def forward(
         x = x.transpose(0, 1)
         self.log_mem('encoder: starting_loop')
         encoder_states, all_attentions = [], []
-        rdd_start = print_tensor_sizes()
+        #rdd_start = print_tensor_sizes()
+        #rdd_start.to_csv(f'rdd_start.csv')
         for i, encoder_layer in enumerate(self.layers):
 
             if self.output_hidden_states:
@@ -305,8 +306,11 @@ def forward(
                 attn = None
             else:
                 x, attn = encoder_layer(x, attention_mask)
+            assert len(encoder_states) == 0
+            self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
+
             if i > 10:
                 rdd = print_tensor_sizes()
                 rdd.to_csv(f'rdd_step_{i}.csv')

From 96e701bc1e81037eafa8144e409f4a2abb4f9d2e Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:29:51 -0400
Subject: [PATCH 54/68] boom boom

---
 src/transformers/modeling_bart.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index dc616273d897..9a5aa21b92d2 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -233,7 +233,7 @@ def forward(self, x, encoder_padding_mask):
         x = self.final_layer_norm(x)
         return x, attn_weights
 
-
+import gc
 class BartEncoder(nn.Module, LoggingMixin):
     """
     Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer
@@ -307,6 +307,7 @@ def forward(
             else:
                 x, attn = encoder_layer(x, attention_mask)
             assert len(encoder_states) == 0
+            assert len(all_attentions) == 0
             self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
@@ -627,7 +628,7 @@ def forward(
         attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
         attn_output = self.out_proj(attn_output)
         #attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-        return attn_output, attn_weights
+        return attn_output, None
 
     def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
         # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)

From 975c2829ef9415e945b1fa1ce09ace75dac4cd15 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:31:37 -0400
Subject: [PATCH 55/68] boom boom

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 9a5aa21b92d2..106059573dbd 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -308,7 +308,7 @@ def forward(
                 x, attn = encoder_layer(x, attention_mask)
             assert len(encoder_states) == 0
             assert len(all_attentions) == 0
-            self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
+            #self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 

From 3b81c2035053d63839f5fbcae6e00664788ec807 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:33:02 -0400
Subject: [PATCH 56/68] boom boom

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 106059573dbd..f987be61a2af 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -312,7 +312,7 @@ def forward(
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 
-            if i > 10:
+            if i >= 8:
                 rdd = print_tensor_sizes()
                 rdd.to_csv(f'rdd_step_{i}.csv')
 

From c572b139bc5d8809326554a6b27fe5d8bd944fc3 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:36:02 -0400
Subject: [PATCH 57/68] boom boom

---
 src/transformers/modeling_bart.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index f987be61a2af..2d17f3137ee4 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -312,9 +312,8 @@ def forward(
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 
-            if i >= 8:
-                rdd = print_tensor_sizes()
-                rdd.to_csv(f'rdd_step_{i}.csv')
+            gc.collect()
+            torch.cuda.empty_cache()
 
             if self.output_attentions:
                 all_attentions.append(attn)

From 32959ea6ec3905f3bc8b8690442a97e4b3ee9ba0 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:39:42 -0400
Subject: [PATCH 58/68] boom boom

---
 src/transformers/modeling_bart.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 2d17f3137ee4..cc9c5b34bed3 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -598,14 +598,17 @@ def forward(
                 "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
                 "prev_key_padding_mask": key_padding_mask if not static_kv else None,
             }
+            self.log_mem('\t attn: done layer_state')
 
         assert k is not None
         src_len = k.size(1)
         attn_weights = torch.bmm(q, k.transpose(1, 2))
+        self.log_mem('\t attn: done BMM(q,k)')
         assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)
 
         if attn_mask is not None:
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask
+            self.log_mem('\t attn: done causal mask')
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
@@ -617,15 +620,21 @@ def forward(
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
             reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
             attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
+            self.log_mem('\t attn: done masked_fill')
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = F.softmax(attn_weights, dim=-1)
+        self.log_mem('\t attn: done softmax')
         attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training,)
 
+
         assert v is not None
         attn_output = torch.bmm(attn_probs, v)
+        self.log_mem('\t attn: done BMM(probs, v)')
         assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        self.log_mem('\t attn: done view(output)')
         attn_output = self.out_proj(attn_output)
+        self.log_mem('\t attn: done out_proj')
         #attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
         return attn_output, None
 

From 5a91a71d1e7d49f598256fbdd6dfdc77557e3ca6 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 15:52:46 -0400
Subject: [PATCH 59/68] v similar

---
 src/transformers/modeling_bart.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index cc9c5b34bed3..bcfc5da541f3 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -305,22 +305,13 @@ def forward(
             if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                 attn = None
             else:
-                x, attn = encoder_layer(x, attention_mask)
+                x, _ = encoder_layer(x, attention_mask)
             assert len(encoder_states) == 0
             assert len(all_attentions) == 0
             #self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
             self.log_mem(f'encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 
-            gc.collect()
-            torch.cuda.empty_cache()
-
-            if self.output_attentions:
-                all_attentions.append(attn)
-
-        if self.output_hidden_states:
-            encoder_states.append(x)
-
         encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states]
         return x, encoder_states, all_attentions
 

From bcfd0d4fdf27e41232a34f5fed417fbf62daf935 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 16:12:49 -0400
Subject: [PATCH 60/68] boom boom

---
 tests/test_bart_memory.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
index 8365136423ec..d976f8557bd7 100644
--- a/tests/test_bart_memory.py
+++ b/tests/test_bart_memory.py
@@ -21,9 +21,12 @@ def setUpClass(cls):
 
     def test_base_model_mem(self):
         model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
-        model.log_mem('after init', verbose=True)
         model.reset_logs()
         model(self.ids)
         log_df = model.combine_logs()
         log_df.to_csv('hf_batch_fwd_logs.csv')
+        model.save_logs('hf_batch_fwd_logs.txt')
         print(model.summary)
+
+
+

From 24c56e8697f6697117db39664540d43687555dd9 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 16:13:31 -0400
Subject: [PATCH 61/68] boom boom

---
 src/transformers/modeling_bart.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index bcfc5da541f3..89a0ac332f60 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -309,7 +309,7 @@ def forward(
             assert len(encoder_states) == 0
             assert len(all_attentions) == 0
             #self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
-            self.log_mem(f'encoder: called layer {i}', verbose=True)
+            self.log_mem(f'Encoder: called layer {i}', verbose=True)
             self.save_logs('hf_fwd_logs.txt')
 
         encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states]

From d78ecc7d52a798affd12b22ee475741a675fe4a5 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 16:21:50 -0400
Subject: [PATCH 62/68] boom boom

---
 tests/test_bart_memory.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
index d976f8557bd7..daf1b2c18ec3 100644
--- a/tests/test_bart_memory.py
+++ b/tests/test_bart_memory.py
@@ -4,10 +4,38 @@
 
 from tests.utils import require_torch, slow
 from transformers import BartTokenizer, BartModel
+from transformers.modeling_bart import shift_tokens_right
 
 DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
+class TestHface(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        source_path = "test.source"
+        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:6]
+        tokenizer = BartTokenizer.from_pretrained('bart-large')
+        dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
+        cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
+        cls.prev_output_tokens = shift_tokens_right(cls.ids, 1).to(DEFAULT_DEVICE)
+        cls.model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
+        #cls.lns = pickle_load('/Users/shleifer/transformers_fork/lns.pkl')
+        return cls
+
+    def test_hf_fwd_batch(self):
+        bart = self.model
+        bart.reset_logs()
+        with torch.no_grad():
+            bart(self.ids)
+        try:
+            log_df = bart.combine_logs()
+            log_df.to_csv('hf_batch_fwd_logs.csv')
+            bart.save_logs('hf_batch_fwd_logs.txt')
+            print(bart.summary)
+        except AttributeError as e:
+            print(e)
+
 @require_torch
 class MemoryTests(unittest.TestCase):
 

From 35377760131d84d89759e65fafde4b49c5005616 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 16:24:35 -0400
Subject: [PATCH 63/68] run test_base_model_mem forward pass under torch.no_grad

---
 tests/test_bart_memory.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
index daf1b2c18ec3..c9506bf4dabe 100644
--- a/tests/test_bart_memory.py
+++ b/tests/test_bart_memory.py
@@ -50,7 +50,8 @@ def setUpClass(cls):
     def test_base_model_mem(self):
         model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
         model.reset_logs()
-        model(self.ids)
+        with torch.no_grad():
+            model(self.ids)
         log_df = model.combine_logs()
         log_df.to_csv('hf_batch_fwd_logs.csv')
         model.save_logs('hf_batch_fwd_logs.txt')

From fb01c11455a7b3c05ffc7fe0ea53556b5a9be5a0 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 18:19:00 -0400
Subject: [PATCH 64/68] make encoder log_mem non-verbose and remove MemoryTests

---
 src/transformers/modeling_bart.py |  2 +-
 tests/test_bart_memory.py         | 27 ++-------------------------
 2 files changed, 3 insertions(+), 26 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 89a0ac332f60..20cdeae34612 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -309,7 +309,7 @@ def forward(
             assert len(encoder_states) == 0
             assert len(all_attentions) == 0
             #self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
-            self.log_mem(f'Encoder: called layer {i}', verbose=True)
+            self.log_mem(f'Encoder: called layer {i}')
             self.save_logs('hf_fwd_logs.txt')
 
         encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states]
diff --git a/tests/test_bart_memory.py b/tests/test_bart_memory.py
index c9506bf4dabe..222222e30cfb 100644
--- a/tests/test_bart_memory.py
+++ b/tests/test_bart_memory.py
@@ -8,7 +8,7 @@
 
 DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-
+@require_torch
 class TestHface(unittest.TestCase):
 
     @classmethod
@@ -30,32 +30,9 @@ def test_hf_fwd_batch(self):
             bart(self.ids)
         try:
             log_df = bart.combine_logs()
-            log_df.to_csv('hf_batch_fwd_logs.csv')
+            #log_df.to_csv('hf_batch_fwd_logs.csv')
             bart.save_logs('hf_batch_fwd_logs.txt')
             print(bart.summary)
         except AttributeError as e:
             print(e)
 
-@require_torch
-class MemoryTests(unittest.TestCase):
-
-    @classmethod
-    def setUpClass(cls):
-        source_path = "test.source"
-        cls.lns = [" " + x.rstrip() for x in open(source_path).readlines()][:6]
-        tokenizer = BartTokenizer.from_pretrained('bart-large')
-        dct = tokenizer.batch_encode_plus(cls.lns, max_length=1024, return_tensors="pt", pad_to_max_length=True)
-        cls.ids = dct['input_ids'].to(DEFAULT_DEVICE)
-
-    def test_base_model_mem(self):
-        model = BartModel.from_pretrained('bart-large').to(DEFAULT_DEVICE)
-        model.reset_logs()
-        with torch.no_grad():
-            model(self.ids)
-        log_df = model.combine_logs()
-        log_df.to_csv('hf_batch_fwd_logs.csv')
-        model.save_logs('hf_batch_fwd_logs.txt')
-        print(model.summary)
-
-
-

From 9462adf190342a8512a468978ee39dcec78c8301 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Thu, 19 Mar 2020 18:19:21 -0400
Subject: [PATCH 65/68] stop calling save_logs inside encoder forward

---
 src/transformers/modeling_bart.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 20cdeae34612..67ffcb713ec2 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -310,8 +310,6 @@ def forward(
             assert len(all_attentions) == 0
             #self.log_mem(f'x: {x.shape}, attn: {attn.shape}')
             self.log_mem(f'Encoder: called layer {i}')
-            self.save_logs('hf_fwd_logs.txt')
-
         encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states]
         return x, encoder_states, all_attentions
 

From 4fc9477293f3faa9c58552530c54af320089d891 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Fri, 20 Mar 2020 13:22:45 -0400
Subject: [PATCH 66/68] inline the unsqueezed key_padding_mask

---
 src/transformers/modeling_bart.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 67ffcb713ec2..486baa4790cb 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -568,13 +568,14 @@ def forward(
             k = self.k_proj(query)
             v = self.v_proj(query)
 
+
         q = self._shape(q, tgt_len, bsz)
-        self.log_mem('\tq_reshape')
+        self.log_mem(f'\tq_reshape -> {q.shape}')
         if k is not None:
             k = self._shape(k, -1, bsz)
         if v is not None:
             v = self._shape(v, -1, bsz)
-            self.log_mem('\t done reshaping k,v')
+        self.log_mem(f'\t done reshaping k,v ->, {k.shape}')
 
         if saved_state is not None:
             self.log_mem('\t about to use saved_state')
@@ -591,6 +592,7 @@ def forward(
 
         assert k is not None
         src_len = k.size(1)
+        self.log_mem('\t attn: before BMM(q,k)')
         attn_weights = torch.bmm(q, k.transpose(1, 2))
         self.log_mem('\t attn: done BMM(q,k)')
         assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)
@@ -607,8 +609,7 @@ def forward(
 
         if key_padding_mask is not None:  # shape (bsz, src_len)
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
-            attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
+            attn_weights = attn_weights.masked_fill(key_padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"))
             self.log_mem('\t attn: done masked_fill')
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = F.softmax(attn_weights, dim=-1)

From 032627906f8f0daad5d4ed9f36df4d125e9d3f40 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Fri, 20 Mar 2020 13:25:05 -0400
Subject: [PATCH 67/68] log k,v reshape only when k is not None

---
 src/transformers/modeling_bart.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 486baa4790cb..ed982fcdd0c7 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -573,9 +573,10 @@ def forward(
         self.log_mem(f'\tq_reshape -> {q.shape}')
         if k is not None:
             k = self._shape(k, -1, bsz)
+            self.log_mem(f'\t done reshaping k,v ->, {k.shape}')
         if v is not None:
             v = self._shape(v, -1, bsz)
-        self.log_mem(f'\t done reshaping k,v ->, {k.shape}')
+
 
         if saved_state is not None:
             self.log_mem('\t about to use saved_state')

From d615cae8f46ab72eeb6e9edb9768b05b983e7872 Mon Sep 17 00:00:00 2001
From: sshleifer <sshleifer@gmail.com>
Date: Fri, 20 Mar 2020 18:33:43 -0400
Subject: [PATCH 68/68] rearrange encoder_outputs call

---
 src/transformers/modeling_utils.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 97bee1809124..f8723c89ddba 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -895,6 +895,16 @@ def generate(
             effective_batch_size = batch_size
             effective_batch_mult = 1
 
+        if self.config.is_encoder_decoder:
+            assert bos_token_id is not None, "Encoder Decoder Models need to have a bos_token_id"
+            assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
+            assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)
+
+            # get encoder and store encoder outputs
+            encoder = self.get_encoder()
+
+            encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
+
         # Expand input ids if num_beams > 1 or num_return_sequences > 1
         if num_return_sequences > 1 or num_beams > 1:
             input_ids_len = input_ids.shape[-1]
@@ -910,16 +920,8 @@ def generate(
                 effective_batch_size * num_beams, input_ids_len
             )  # shape: (batch_size * num_return_sequences * num_beams, cur_len)
 
-        if self.config.is_encoder_decoder:
-            assert bos_token_id is not None, "Encoder Decoder Models need to have a bos_token_id"
-            assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
-            assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)
-
-            # get encoder and store encoder outputs
-            encoder = self.get_encoder()
-
-            encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
 
+        if self.config.is_encoder_decoder:
             # create empty decoder_input_ids
             input_ids = torch.full(
                 (effective_batch_size * num_beams, 1),