Vendor import of llvm trunk r256945:

https://llvm.org/svn/llvm-project/llvm/trunk@256945
2026-05-28 04:12:45 -04:00 · 2016-01-06 20:01:02 +00:00 · 2016-01-06 20:01:02 +00:00 · 8a6c1c25bc
commit 8a6c1c25bc
parent 84fe440ded
244 changed files with 8769 additions and 3695 deletions
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@ -26,7 +26,10 @@ Quick start
 We use here the command-line, non-interactive CMake interface.

 #. `Download <http://www.cmake.org/cmake/resources/software.html>`_ and install
-   CMake. Version 2.8.8 is the minimum required.
+   CMake. Version 2.8.8 is the minimum required, but if you're using the Ninja
+   backend, CMake v3.2 or newer is required to `get interactive output
+   <http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20141117/244797.html>`_
+   when running :doc:`Lit <CommandGuide/lit>`.

 #. Open a shell. Your development tools must be reachable from this shell
   through the PATH environment variable.
--- a/docs/CoverageMappingFormat.rst
+++ b/docs/CoverageMappingFormat.rst
@ -241,15 +241,25 @@ For example, let’s consider a C file and how it gets compiled to LLVM:
    return 13;
  }

-The coverage mapping variable generated by Clang is:
+The coverage mapping variable generated by Clang has 3 fields:
+
+* Coverage mapping header.
+
+* An array of function records.
+
+* Coverage mapping data which is an array of bytes. Zero paddings are added at the end to force 8 byte alignment.

 .. code-block:: llvm

-  @__llvm_coverage_mapping = internal constant { i32, i32, i32, i32, [2 x { i8*, i32, i32 }], [40 x i8] }
-  { i32 2,  ; The number of function records
-    i32 20, ; The length of the string that contains the encoded translation unit filenames
-    i32 20, ; The length of the string that contains the encoded coverage mapping data
-    i32 0,  ; Coverage mapping format version
+  @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x { i8*, i32, i32 }], [40 x i8] }
+  { 
+    { i32, i32, i32, i32 } ; Coverage map header
+    {
+      i32 2,  ; The number of function records
+      i32 20, ; The length of the string that contains the encoded translation unit filenames
+      i32 20, ; The length of the string that contains the encoded coverage mapping data
+      i32 0,  ; Coverage mapping format version
+    },
    [2 x { i8*, i32, i32 }] [ ; Function records
     { i8*, i32, i32 } { i8* getelementptr inbounds ([3 x i8]* @__llvm_profile_name_foo, i32 0, i32 0), ; Function's name
       i32 3, ; Function's name length
@ -262,12 +272,18 @@ The coverage mapping variable generated by Clang is:
   [40 x i8] c"..." ; Encoded data (dissected later)
  }, section "__llvm_covmap", align 8

-Version:
--------
+Coverage Mapping Header:
+------------------------

-The coverage mapping version number can have the following values:
+The coverage mapping header has the following fields:

-* 0 — The first (current) version of the coverage mapping format.
+* The number of function records.
+
+* The length of the string in the third field of *__llvm_coverage_mapping* that contains the encoded translation unit filenames.
+
+* The length of the string in the third field of *__llvm_coverage_mapping* that contains the encoded coverage mapping data.
+
+* The format version. 0 is the first (current) version of the coverage mapping format.

 .. _function records:

@ -331,7 +347,7 @@ IR for the `coverage mapping sample`_ that was shown earlier:
 * The length of the substring that contains the encoded coverage mapping data
  for the first function is the value of the third field in the first
  structure in an array of `function records`_ stored in the
-  fifth field of the *__llvm_coverage_mapping* structure, which is the 9.
+  third field of the *__llvm_coverage_mapping* structure, which is the 9.
  Therefore, the coverage mapping for the first function record is encoded
  in this string:

@ -351,7 +367,7 @@ IR for the `coverage mapping sample`_ that was shown earlier:
  | ``0x01`` | The number of mapping regions that are stored in an array for the function's file id #0.                                |
  +----------+-------------------------------------------------------------------------------------------------------------------------+
  | ``0x01`` | The coverage mapping counter for the first region in this function. The value of 1 tells us that it's a coverage        |
-  |          | mapping counter that is a reference ot the profile instrumentation counter with an index of 0.                          |
+  |          | mapping counter that is a reference to the profile instrumentation counter with an index of 0.                          |
  +----------+-------------------------------------------------------------------------------------------------------------------------+
  | ``0x01`` | The starting line of the first mapping region in this function.                                                         |
  +----------+-------------------------------------------------------------------------------------------------------------------------+
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@ -78,6 +78,8 @@ Here's the short story for getting up and running quickly with LLVM:

   The usual build uses `CMake <CMake.html>`_. If you would rather use
   autotools, see `Building LLVM with autotools <BuildingLLVMWithAutotools.html>`_.
+   Although the build is known to work with CMake >= 2.8.8, we recommend CMake
+   >= v3.2, especially if you're generating Ninja build files.

   * ``cd where you want to build llvm``
   * ``mkdir build``
--- a/docs/MCJITDesignAndImplementation.rst
+++ b/docs/MCJITDesignAndImplementation.rst
@ -1,180 +1,180 @@
-===============================
-MCJIT Design and Implementation
-===============================
-
-Introduction
-============
-
-This document describes the internal workings of the MCJIT execution
-engine and the RuntimeDyld component.  It is intended as a high level
-overview of the implementation, showing the flow and interactions of
-objects throughout the code generation and dynamic loading process.
-
-Engine Creation
-===============
-
-In most cases, an EngineBuilder object is used to create an instance of
-the MCJIT execution engine.  The EngineBuilder takes an llvm::Module
-object as an argument to its constructor.  The client may then set various
-options that we control the later be passed along to the MCJIT engine,
-including the selection of MCJIT as the engine type to be created.
-Of particular interest is the EngineBuilder::setMCJITMemoryManager
-function.  If the client does not explicitly create a memory manager at
-this time, a default memory manager (specifically SectionMemoryManager)
-will be created when the MCJIT engine is instantiated.
-
-Once the options have been set, a client calls EngineBuilder::create to
-create an instance of the MCJIT engine.  If the client does not use the
-form of this function that takes a TargetMachine as a parameter, a new
-TargetMachine will be created based on the target triple associated with
-the Module that was used to create the EngineBuilder.
-
-.. image:: MCJIT-engine-builder.png
- 
-EngineBuilder::create will call the static MCJIT::createJIT function,
-passing in its pointers to the module, memory manager and target machine
-objects, all of which will subsequently be owned by the MCJIT object.
-
-The MCJIT class has a member variable, Dyld, which contains an instance of
-the RuntimeDyld wrapper class.  This member will be used for
-communications between MCJIT and the actual RuntimeDyldImpl object that
-gets created when an object is loaded.
-
-.. image:: MCJIT-creation.png
- 
-Upon creation, MCJIT holds a pointer to the Module object that it received
-from EngineBuilder but it does not immediately generate code for this
-module.  Code generation is deferred until either the
-MCJIT::finalizeObject method is called explicitly or a function such as
-MCJIT::getPointerToFunction is called which requires the code to have been
-generated.
-
-Code Generation
-===============
-
-When code generation is triggered, as described above, MCJIT will first
-attempt to retrieve an object image from its ObjectCache member, if one
-has been set.  If a cached object image cannot be retrieved, MCJIT will
-call its emitObject method.  MCJIT::emitObject uses a local PassManager
-instance and creates a new ObjectBufferStream instance, both of which it
-passes to TargetMachine::addPassesToEmitMC before calling PassManager::run
-on the Module with which it was created.
-
-.. image:: MCJIT-load.png
- 
-The PassManager::run call causes the MC code generation mechanisms to emit
-a complete relocatable binary object image (either in either ELF or MachO
-format, depending on the target) into the ObjectBufferStream object, which
-is flushed to complete the process.  If an ObjectCache is being used, the
-image will be passed to the ObjectCache here.
-
-At this point, the ObjectBufferStream contains the raw object image.
-Before the code can be executed, the code and data sections from this
-image must be loaded into suitable memory, relocations must be applied and
-memory permission and code cache invalidation (if required) must be completed.
-
-Object Loading
-==============
-
-Once an object image has been obtained, either through code generation or
-having been retrieved from an ObjectCache, it is passed to RuntimeDyld to
-be loaded.  The RuntimeDyld wrapper class examines the object to determine
-its file format and creates an instance of either RuntimeDyldELF or
-RuntimeDyldMachO (both of which derive from the RuntimeDyldImpl base
-class) and calls the RuntimeDyldImpl::loadObject method to perform that
-actual loading.
-
-.. image:: MCJIT-dyld-load.png
- 
-RuntimeDyldImpl::loadObject begins by creating an ObjectImage instance
-from the ObjectBuffer it received.  ObjectImage, which wraps the
-ObjectFile class, is a helper class which parses the binary object image
-and provides access to the information contained in the format-specific
-headers, including section, symbol and relocation information.
-
-RuntimeDyldImpl::loadObject then iterates through the symbols in the
-image.  Information about common symbols is collected for later use.  For
-each function or data symbol, the associated section is loaded into memory
-and the symbol is stored in a symbol table map data structure.  When the
-iteration is complete, a section is emitted for the common symbols.
-
-Next, RuntimeDyldImpl::loadObject iterates through the sections in the
-object image and for each section iterates through the relocations for
-that sections.  For each relocation, it calls the format-specific
-processRelocationRef method, which will examine the relocation and store
-it in one of two data structures, a section-based relocation list map and
-an external symbol relocation map.
-
-.. image:: MCJIT-load-object.png
- 
-When RuntimeDyldImpl::loadObject returns, all of the code and data
-sections for the object will have been loaded into memory allocated by the
-memory manager and relocation information will have been prepared, but the
-relocations have not yet been applied and the generated code is still not
-ready to be executed.
-
-[Currently (as of August 2013) the MCJIT engine will immediately apply
-relocations when loadObject completes.  However, this shouldn't be
-happening.  Because the code may have been generated for a remote target,
-the client should be given a chance to re-map the section addresses before
-relocations are applied.  It is possible to apply relocations multiple
-times, but in the case where addresses are to be re-mapped, this first
-application is wasted effort.]
-
-Address Remapping
-=================
-
-At any time after initial code has been generated and before
-finalizeObject is called, the client can remap the address of sections in
-the object.  Typically this is done because the code was generated for an
-external process and is being mapped into that process' address space.
-The client remaps the section address by calling MCJIT::mapSectionAddress.
-This should happen before the section memory is copied to its new
-location.
-
-When MCJIT::mapSectionAddress is called, MCJIT passes the call on to
-RuntimeDyldImpl (via its Dyld member).  RuntimeDyldImpl stores the new
-address in an internal data structure but does not update the code at this
-time, since other sections are likely to change.
-
-When the client is finished remapping section addresses, it will call
-MCJIT::finalizeObject to complete the remapping process.
-
-Final Preparations
-==================
-
-When MCJIT::finalizeObject is called, MCJIT calls
-RuntimeDyld::resolveRelocations.  This function will attempt to locate any
-external symbols and then apply all relocations for the object.
-
-External symbols are resolved by calling the memory manager's
-getPointerToNamedFunction method.  The memory manager will return the
-address of the requested symbol in the target address space.  (Note, this
-may not be a valid pointer in the host process.)  RuntimeDyld will then
-iterate through the list of relocations it has stored which are associated
-with this symbol and invoke the resolveRelocation method which, through an
-format-specific implementation, will apply the relocation to the loaded
-section memory.
-
-Next, RuntimeDyld::resolveRelocations iterates through the list of
-sections and for each section iterates through a list of relocations that
-have been saved which reference that symbol and call resolveRelocation for
-each entry in this list.  The relocation list here is a list of
-relocations for which the symbol associated with the relocation is located
-in the section associated with the list.  Each of these locations will
-have a target location at which the relocation will be applied that is
-likely located in a different section.
-
-.. image:: MCJIT-resolve-relocations.png
- 
-Once relocations have been applied as described above, MCJIT calls
-RuntimeDyld::getEHFrameSection, and if a non-zero result is returned
-passes the section data to the memory manager's registerEHFrames method.
-This allows the memory manager to call any desired target-specific
-functions, such as registering the EH frame information with a debugger.
-
-Finally, MCJIT calls the memory manager's finalizeMemory method.  In this
-method, the memory manager will invalidate the target code cache, if
-necessary, and apply final permissions to the memory pages it has
-allocated for code and data memory.
-
+===============================
+MCJIT Design and Implementation
+===============================
+
+Introduction
+============
+
+This document describes the internal workings of the MCJIT execution
+engine and the RuntimeDyld component.  It is intended as a high level
+overview of the implementation, showing the flow and interactions of
+objects throughout the code generation and dynamic loading process.
+
+Engine Creation
+===============
+
+In most cases, an EngineBuilder object is used to create an instance of
+the MCJIT execution engine.  The EngineBuilder takes an llvm::Module
+object as an argument to its constructor.  The client may then set various
+options that we control the later be passed along to the MCJIT engine,
+including the selection of MCJIT as the engine type to be created.
+Of particular interest is the EngineBuilder::setMCJITMemoryManager
+function.  If the client does not explicitly create a memory manager at
+this time, a default memory manager (specifically SectionMemoryManager)
+will be created when the MCJIT engine is instantiated.
+
+Once the options have been set, a client calls EngineBuilder::create to
+create an instance of the MCJIT engine.  If the client does not use the
+form of this function that takes a TargetMachine as a parameter, a new
+TargetMachine will be created based on the target triple associated with
+the Module that was used to create the EngineBuilder.
+
+.. image:: MCJIT-engine-builder.png
+ 
+EngineBuilder::create will call the static MCJIT::createJIT function,
+passing in its pointers to the module, memory manager and target machine
+objects, all of which will subsequently be owned by the MCJIT object.
+
+The MCJIT class has a member variable, Dyld, which contains an instance of
+the RuntimeDyld wrapper class.  This member will be used for
+communications between MCJIT and the actual RuntimeDyldImpl object that
+gets created when an object is loaded.
+
+.. image:: MCJIT-creation.png
+ 
+Upon creation, MCJIT holds a pointer to the Module object that it received
+from EngineBuilder but it does not immediately generate code for this
+module.  Code generation is deferred until either the
+MCJIT::finalizeObject method is called explicitly or a function such as
+MCJIT::getPointerToFunction is called which requires the code to have been
+generated.
+
+Code Generation
+===============
+
+When code generation is triggered, as described above, MCJIT will first
+attempt to retrieve an object image from its ObjectCache member, if one
+has been set.  If a cached object image cannot be retrieved, MCJIT will
+call its emitObject method.  MCJIT::emitObject uses a local PassManager
+instance and creates a new ObjectBufferStream instance, both of which it
+passes to TargetMachine::addPassesToEmitMC before calling PassManager::run
+on the Module with which it was created.
+
+.. image:: MCJIT-load.png
+ 
+The PassManager::run call causes the MC code generation mechanisms to emit
+a complete relocatable binary object image (either in either ELF or MachO
+format, depending on the target) into the ObjectBufferStream object, which
+is flushed to complete the process.  If an ObjectCache is being used, the
+image will be passed to the ObjectCache here.
+
+At this point, the ObjectBufferStream contains the raw object image.
+Before the code can be executed, the code and data sections from this
+image must be loaded into suitable memory, relocations must be applied and
+memory permission and code cache invalidation (if required) must be completed.
+
+Object Loading
+==============
+
+Once an object image has been obtained, either through code generation or
+having been retrieved from an ObjectCache, it is passed to RuntimeDyld to
+be loaded.  The RuntimeDyld wrapper class examines the object to determine
+its file format and creates an instance of either RuntimeDyldELF or
+RuntimeDyldMachO (both of which derive from the RuntimeDyldImpl base
+class) and calls the RuntimeDyldImpl::loadObject method to perform that
+actual loading.
+
+.. image:: MCJIT-dyld-load.png
+ 
+RuntimeDyldImpl::loadObject begins by creating an ObjectImage instance
+from the ObjectBuffer it received.  ObjectImage, which wraps the
+ObjectFile class, is a helper class which parses the binary object image
+and provides access to the information contained in the format-specific
+headers, including section, symbol and relocation information.
+
+RuntimeDyldImpl::loadObject then iterates through the symbols in the
+image.  Information about common symbols is collected for later use.  For
+each function or data symbol, the associated section is loaded into memory
+and the symbol is stored in a symbol table map data structure.  When the
+iteration is complete, a section is emitted for the common symbols.
+
+Next, RuntimeDyldImpl::loadObject iterates through the sections in the
+object image and for each section iterates through the relocations for
+that sections.  For each relocation, it calls the format-specific
+processRelocationRef method, which will examine the relocation and store
+it in one of two data structures, a section-based relocation list map and
+an external symbol relocation map.
+
+.. image:: MCJIT-load-object.png
+ 
+When RuntimeDyldImpl::loadObject returns, all of the code and data
+sections for the object will have been loaded into memory allocated by the
+memory manager and relocation information will have been prepared, but the
+relocations have not yet been applied and the generated code is still not
+ready to be executed.
+
+[Currently (as of August 2013) the MCJIT engine will immediately apply
+relocations when loadObject completes.  However, this shouldn't be
+happening.  Because the code may have been generated for a remote target,
+the client should be given a chance to re-map the section addresses before
+relocations are applied.  It is possible to apply relocations multiple
+times, but in the case where addresses are to be re-mapped, this first
+application is wasted effort.]
+
+Address Remapping
+=================
+
+At any time after initial code has been generated and before
+finalizeObject is called, the client can remap the address of sections in
+the object.  Typically this is done because the code was generated for an
+external process and is being mapped into that process' address space.
+The client remaps the section address by calling MCJIT::mapSectionAddress.
+This should happen before the section memory is copied to its new
+location.
+
+When MCJIT::mapSectionAddress is called, MCJIT passes the call on to
+RuntimeDyldImpl (via its Dyld member).  RuntimeDyldImpl stores the new
+address in an internal data structure but does not update the code at this
+time, since other sections are likely to change.
+
+When the client is finished remapping section addresses, it will call
+MCJIT::finalizeObject to complete the remapping process.
+
+Final Preparations
+==================
+
+When MCJIT::finalizeObject is called, MCJIT calls
+RuntimeDyld::resolveRelocations.  This function will attempt to locate any
+external symbols and then apply all relocations for the object.
+
+External symbols are resolved by calling the memory manager's
+getPointerToNamedFunction method.  The memory manager will return the
+address of the requested symbol in the target address space.  (Note, this
+may not be a valid pointer in the host process.)  RuntimeDyld will then
+iterate through the list of relocations it has stored which are associated
+with this symbol and invoke the resolveRelocation method which, through an
+format-specific implementation, will apply the relocation to the loaded
+section memory.
+
+Next, RuntimeDyld::resolveRelocations iterates through the list of
+sections and for each section iterates through a list of relocations that
+have been saved which reference that symbol and call resolveRelocation for
+each entry in this list.  The relocation list here is a list of
+relocations for which the symbol associated with the relocation is located
+in the section associated with the list.  Each of these locations will
+have a target location at which the relocation will be applied that is
+likely located in a different section.
+
+.. image:: MCJIT-resolve-relocations.png
+ 
+Once relocations have been applied as described above, MCJIT calls
+RuntimeDyld::getEHFrameSection, and if a non-zero result is returned
+passes the section data to the memory manager's registerEHFrames method.
+This allows the memory manager to call any desired target-specific
+functions, such as registering the EH frame information with a debugger.
+
+Finally, MCJIT calls the memory manager's finalizeMemory method.  In this
+method, the memory manager will invalidate the target code cache, if
+necessary, and apply final permissions to the memory pages it has
+allocated for code and data memory.
+
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@ -59,11 +59,6 @@ bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
 bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
                   bool LookThroughBitCast = false);

-/// \brief Tests if a value is a call or invoke to a library function that
-/// allocates memory and never returns null (such as operator new).
-bool isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
-                         bool LookThroughBitCast = false);
-
 //===----------------------------------------------------------------------===//
 //  malloc Call Utility Functions.
 //
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@ -97,7 +97,7 @@ private:
  // of memory operands required to be precise exceeds the maximum value of
  // NumMemRefs - currently 256 - we remove the operands entirely. Note also
  // that this is a non-owning reference to a shared copy on write buffer owned
-  // by the MachineFunction and created via MF.allocateMemRefsArray. 
+  // by the MachineFunction and created via MF.allocateMemRefsArray.
  mmo_iterator MemRefs;

  DebugLoc debugLoc;                    // Source line information.
@ -354,7 +354,7 @@ public:
  mmo_iterator memoperands_end() const { return MemRefs + NumMemRefs; }
  /// Return true if we don't have any memory operands which described the the
  /// memory access done by this instruction.  If this is true, calling code
-  /// must be conservative.    
+  /// must be conservative.
  bool memoperands_empty() const { return NumMemRefs == 0; }

  iterator_range<mmo_iterator>  memoperands() {
@ -774,7 +774,7 @@ public:
  bool isKill() const { return getOpcode() == TargetOpcode::KILL; }
  bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; }
  bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; }
-  bool isMSInlineAsm() const { 
+  bool isMSInlineAsm() const {
    return getOpcode() == TargetOpcode::INLINEASM && getInlineAsmDialect();
  }
  bool isStackAligningInlineAsm() const;
@ -1180,11 +1180,26 @@ public:
  /// Assign this MachineInstr's memory reference descriptor list.
  /// This does not transfer ownership.
  void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd) {
-    MemRefs = NewMemRefs;
-    NumMemRefs = uint8_t(NewMemRefsEnd - NewMemRefs);
-    assert(NumMemRefs == NewMemRefsEnd - NewMemRefs && "Too many memrefs");
+    setMemRefs(std::make_pair(NewMemRefs, NewMemRefsEnd-NewMemRefs));
  }

+  /// Assign this MachineInstr's memory reference descriptor list.  First
+  /// element in the pair is the begin iterator/pointer to the array; the
+  /// second is the number of MemoryOperands.  This does not transfer ownership
+  /// of the underlying memory.
+  void setMemRefs(std::pair<mmo_iterator, unsigned> NewMemRefs) {
+    MemRefs = NewMemRefs.first;
+    NumMemRefs = uint8_t(NewMemRefs.second);
+    assert(NumMemRefs == NewMemRefs.second &&
+           "Too many memrefs - must drop memory operands");
+  }
+
+  /// Return a set of memrefs (begin iterator, size) which conservatively
+  /// describe the memory behavior of both MachineInstrs.  This is appropriate
+  /// for use when merging two MachineInstrs into one. This routine does not
+  /// modify the memrefs of the this MachineInstr.
+  std::pair<mmo_iterator, unsigned> mergeMemRefsWith(const MachineInstr& Other);
+
  /// Clear this MachineInstr's memory reference descriptor list.  This resets
  /// the memrefs to their most conservative state.  This should be used only
  /// as a last resort since it greatly pessimizes our knowledge of the memory
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@ -162,6 +162,11 @@ public:
    return *this;
  }

+  const MachineInstrBuilder &setMemRefs(std::pair<MachineInstr::mmo_iterator,
+                                        unsigned> MemOperandsRef) const {
+    MI->setMemRefs(MemOperandsRef);
+    return *this;
+  }

  const MachineInstrBuilder &addOperand(const MachineOperand &MO) const {
    MI->addOperand(*MF, MO);
--- a/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/include/llvm/CodeGen/MachineInstrBundle.h
@ -178,7 +178,7 @@ public:
    /// register.
    bool FullyDefined;

-    /// Reg or ont of its aliases is read. The register may only be read
+    /// Reg or one of its aliases is read. The register may only be read
    /// partially.
    bool Read;
    /// Reg or a super-register is read. The full register is read.
--- a/include/llvm/CodeGen/WinEHFuncInfo.h
+++ b/include/llvm/CodeGen/WinEHFuncInfo.h
@ -83,7 +83,9 @@ enum class ClrHandlerType { Catch, Finally, Fault, Filter };
 struct ClrEHUnwindMapEntry {
  MBBOrBasicBlock Handler;
  uint32_t TypeToken;
-  int Parent;
+  int HandlerParentState; ///< Outer handler enclosing this entry's handler
+  int TryParentState; ///< Outer try region enclosing this entry's try region,
+                      ///< treating later catches on same try as "outer"
  ClrHandlerType HandlerType;
 };

--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@ -310,6 +310,11 @@ public:
    CALLSITE_DELEGATE_GETTER(hasFnAttr(A));
  }

+  /// \brief Return true if this function has the given attribute.
+  bool hasFnAttr(StringRef A) const {
+    CALLSITE_DELEGATE_GETTER(hasFnAttr(A));
+  }
+
  /// \brief Return true if the call or the callee has the given attribute.
  bool paramHasAttr(unsigned i, Attribute::AttrKind A) const {
    CALLSITE_DELEGATE_GETTER(paramHasAttr(i, A));
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@ -61,9 +61,13 @@ protected:
  MDNode *DefaultFPMathTag;
  FastMathFlags FMF;

+  ArrayRef<OperandBundleDef> DefaultOperandBundles;
+
 public:
-  IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr)
-    : Context(context), DefaultFPMathTag(FPMathTag), FMF() {
+  IRBuilderBase(LLVMContext &context, MDNode *FPMathTag = nullptr,
+                ArrayRef<OperandBundleDef> OpBundles = None)
+      : Context(context), DefaultFPMathTag(FPMathTag), FMF(),
+        DefaultOperandBundles(OpBundles) {
    ClearInsertionPoint();
  }

@ -538,37 +542,44 @@ class IRBuilder : public IRBuilderBase, public Inserter {

 public:
  IRBuilder(LLVMContext &C, const T &F, Inserter I = Inserter(),
-            MDNode *FPMathTag = nullptr)
-      : IRBuilderBase(C, FPMathTag), Inserter(std::move(I)), Folder(F) {}
+            MDNode *FPMathTag = nullptr,
+            ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(C, FPMathTag, OpBundles), Inserter(std::move(I)),
+        Folder(F) {}

-  explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(C, FPMathTag), Folder() {
-  }
+  explicit IRBuilder(LLVMContext &C, MDNode *FPMathTag = nullptr,
+                     ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(C, FPMathTag, OpBundles), Folder() {}

-  explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
+  explicit IRBuilder(BasicBlock *TheBB, const T &F, MDNode *FPMathTag = nullptr,
+                     ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder(F) {
    SetInsertPoint(TheBB);
  }

-  explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
+  explicit IRBuilder(BasicBlock *TheBB, MDNode *FPMathTag = nullptr,
+                     ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
    SetInsertPoint(TheBB);
  }

-  explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(IP->getContext(), FPMathTag), Folder() {
+  explicit IRBuilder(Instruction *IP, MDNode *FPMathTag = nullptr,
+                     ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(IP->getContext(), FPMathTag, OpBundles), Folder() {
    SetInsertPoint(IP);
  }

-  IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T& F,
-            MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder(F) {
+  IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP, const T &F,
+            MDNode *FPMathTag = nullptr,
+            ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder(F) {
    SetInsertPoint(TheBB, IP);
  }

  IRBuilder(BasicBlock *TheBB, BasicBlock::iterator IP,
-            MDNode *FPMathTag = nullptr)
-    : IRBuilderBase(TheBB->getContext(), FPMathTag), Folder() {
+            MDNode *FPMathTag = nullptr,
+            ArrayRef<OperandBundleDef> OpBundles = None)
+      : IRBuilderBase(TheBB->getContext(), FPMathTag, OpBundles), Folder() {
    SetInsertPoint(TheBB, IP);
  }

@ -1529,8 +1540,11 @@ public:

  CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args = None,
                       ArrayRef<OperandBundleDef> OpBundles = None,
-                       const Twine &Name = "") {
-    return Insert(CallInst::Create(Callee, Args, OpBundles), Name);
+                       const Twine &Name = "", MDNode *FPMathTag = nullptr) {
+    CallInst *CI = CallInst::Create(Callee, Args, OpBundles);
+    if (isa<FPMathOperator>(CI))
+      CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
+    return Insert(CI, Name);
  }

  CallInst *CreateCall(Value *Callee, ArrayRef<Value *> Args,
@ -1543,7 +1557,7 @@ public:
  CallInst *CreateCall(llvm::FunctionType *FTy, Value *Callee,
                       ArrayRef<Value *> Args, const Twine &Name = "",
                       MDNode *FPMathTag = nullptr) {
-    CallInst *CI = CallInst::Create(FTy, Callee, Args);
+    CallInst *CI = CallInst::Create(FTy, Callee, Args, DefaultOperandBundles);
    if (isa<FPMathOperator>(CI))
      CI = cast<CallInst>(AddFPMathAttributes(CI, FPMathTag, FMF));
    return Insert(CI, Name);
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@ -3550,6 +3550,11 @@ public:
    return hasFnAttrImpl(A);
  }

+  /// \brief Determine whether this call has the given attribute.
+  bool hasFnAttr(StringRef A) const {
+    return hasFnAttrImpl(A);
+  }
+
  /// \brief Determine whether the call or the callee has the given attributes.
  bool paramHasAttr(unsigned i, Attribute::AttrKind A) const;

@ -3734,7 +3739,19 @@ private:
  unsigned getNumSuccessorsV() const override;
  void setSuccessorV(unsigned idx, BasicBlock *B) override;

-  bool hasFnAttrImpl(Attribute::AttrKind A) const;
+  template <typename AttrKind> bool hasFnAttrImpl(AttrKind A) const {
+    if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
+      return true;
+
+    // Operand bundles override attributes on the called function, but don't
+    // override attributes directly present on the invoke instruction.
+    if (isFnAttrDisallowedByOpBundle(A))
+      return false;
+
+    if (const Function *F = getCalledFunction())
+      return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
+    return false;
+  }

  // Shadow Instruction::setInstructionSubclassData with a private forwarding
  // method so that subclasses cannot accidentally use it.
@ -3966,6 +3983,8 @@ public:
  /// point to the added handler.
  void addHandler(BasicBlock *Dest);

+  void removeHandler(handler_iterator HI);
+
  unsigned getNumSuccessors() const { return getNumOperands() - 1; }
  BasicBlock *getSuccessor(unsigned Idx) const {
    assert(Idx < getNumSuccessors() &&
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@ -32,6 +32,19 @@ let TargetPrefix = "x86" in {
                                        [IntrNoMem]>;
 }

+//===----------------------------------------------------------------------===//
+// FLAGS.
+let TargetPrefix = "x86" in {
+  def int_x86_flags_read_u32 : GCCBuiltin<"__builtin_ia32_readeflags_u32">,
+        Intrinsic<[llvm_i32_ty], [], []>;
+  def int_x86_flags_read_u64 : GCCBuiltin<"__builtin_ia32_readeflags_u64">,
+        Intrinsic<[llvm_i64_ty], [], []>;
+  def int_x86_flags_write_u32 : GCCBuiltin<"__builtin_ia32_writeeflags_u32">,
+        Intrinsic<[], [llvm_i32_ty], []>;
+  def int_x86_flags_write_u64 : GCCBuiltin<"__builtin_ia32_writeeflags_u64">,
+        Intrinsic<[], [llvm_i64_ty], []>;
+}
+
 //===----------------------------------------------------------------------===//
 // Read Time Stamp Counter.
 let TargetPrefix = "x86" in {
@ -2211,6 +2224,25 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
                         llvm_i8_ty, llvm_v32i16_ty,  llvm_i32_ty], [IntrNoMem]>;

+  def int_x86_avx512_mask_psra_w_128 : GCCBuiltin<"__builtin_ia32_psraw128_mask">,
+              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                         llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_w_256 : GCCBuiltin<"__builtin_ia32_psraw256_mask">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v8i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_w_512 : GCCBuiltin<"__builtin_ia32_psraw512_mask">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+                         llvm_v8i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_wi_128 : GCCBuiltin<"__builtin_ia32_psrawi128_mask">,
+              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
+                         llvm_i8_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_wi_256 : GCCBuiltin<"__builtin_ia32_psrawi256_mask">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_i8_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_wi_512 : GCCBuiltin<"__builtin_ia32_psrawi512_mask">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
+                         llvm_i8_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+
  def int_x86_avx512_mask_psll_d : GCCBuiltin<"__builtin_ia32_pslld512_mask">,
              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
                         llvm_v4i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
@ -2229,6 +2261,69 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx512_mask_psra_q : GCCBuiltin<"__builtin_ia32_psraq512_mask">,
              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
                         llvm_v2i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psra_d_128 : GCCBuiltin<"__builtin_ia32_psrad128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+                         llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_d_256 : GCCBuiltin<"__builtin_ia32_psrad256_mask">,
+             Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, 
+                         llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_di_128 : GCCBuiltin<"__builtin_ia32_psradi128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, 
+                         llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_di_256 : GCCBuiltin<"__builtin_ia32_psradi256_mask">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, 
+                        llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_di_512 : GCCBuiltin<"__builtin_ia32_psradi512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
+                         llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_q_128 : GCCBuiltin<"__builtin_ia32_psraq128_mask">,
+              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, 
+                         llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_q_256 : GCCBuiltin<"__builtin_ia32_psraq256_mask">,
+             Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, 
+                         llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_qi_128 : GCCBuiltin<"__builtin_ia32_psraqi128_mask">,
+              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, 
+                         llvm_i8_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_qi_256 : GCCBuiltin<"__builtin_ia32_psraqi256_mask">,
+             Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, 
+                         llvm_i8_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psra_qi_512 : GCCBuiltin<"__builtin_ia32_psraqi512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, 
+                         llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psrl_d_128: GCCBuiltin<"__builtin_ia32_psrld128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [ llvm_v4i32_ty,
+                         llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_d_256: GCCBuiltin<"__builtin_ia32_psrld256_mask">,
+              Intrinsic<[llvm_v8i32_ty], [ llvm_v8i32_ty, 
+                         llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_di_128: GCCBuiltin<"__builtin_ia32_psrldi128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [ llvm_v4i32_ty, 
+                         llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_di_256: GCCBuiltin<"__builtin_ia32_psrldi256_mask">,
+              Intrinsic<[llvm_v8i32_ty], [ llvm_v8i32_ty, 
+                         llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty ], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_di_512: GCCBuiltin<"__builtin_ia32_psrldi512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [ llvm_v16i32_ty, 
+                         llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty ], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psrl_q_128:  GCCBuiltin<"__builtin_ia32_psrlq128_mask">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, 
+                   llvm_v2i64_ty, llvm_v2i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_q_256: GCCBuiltin<"__builtin_ia32_psrlq256_mask">,
+        Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,  
+                   llvm_v2i64_ty, llvm_v4i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_qi_128: GCCBuiltin<"__builtin_ia32_psrlqi128_mask">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+                   llvm_i8_ty, llvm_v2i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_qi_256: GCCBuiltin<"__builtin_ia32_psrlqi256_mask">,
+        Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
+                   llvm_i8_ty, llvm_v4i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrl_qi_512: GCCBuiltin<"__builtin_ia32_psrlqi512_mask">,
+        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, 
+                   llvm_i8_ty, llvm_v8i64_ty,  llvm_i8_ty], [IntrNoMem]>;
 }

 // Pack ops.
@ -2696,6 +2791,59 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx512_psrl_dq_512 : GCCBuiltin<"__builtin_ia32_psrldq512">,
              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], 
                        [IntrNoMem]>;                        
+
+  def int_x86_avx512_mask_psll_d_128 : GCCBuiltin<"__builtin_ia32_pslld128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
+                         llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_d_256 : GCCBuiltin<"__builtin_ia32_pslld256_mask">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, 
+                         llvm_v4i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_di_128 : GCCBuiltin<"__builtin_ia32_pslldi128_mask">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, 
+                         llvm_i8_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_di_256 : GCCBuiltin<"__builtin_ia32_pslldi256_mask">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, 
+                         llvm_i8_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_di_512 : GCCBuiltin<"__builtin_ia32_pslldi512_mask">,
+              Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, 
+                         llvm_i8_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_q_128 : GCCBuiltin<"__builtin_ia32_psllq128_mask">,
+              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+                         llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_q_256 : GCCBuiltin<"__builtin_ia32_psllq256_mask">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, 
+                         llvm_v2i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_qi_128 : GCCBuiltin<"__builtin_ia32_psllqi128_mask">,
+              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, 
+                         llvm_i8_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_qi_256 : GCCBuiltin<"__builtin_ia32_psllqi256_mask">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, 
+                         llvm_i8_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psll_qi_512 : GCCBuiltin<"__builtin_ia32_psllqi512_mask">,
+              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, 
+                         llvm_i8_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask_psrlv16_hi : GCCBuiltin<"__builtin_ia32_psrlv16hi_mask">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty, llvm_v16i16_ty,  llvm_i16_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv2_di : GCCBuiltin<"__builtin_ia32_psrlv2di_mask">,
+              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
+                         llvm_v2i64_ty, llvm_v2i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv32hi : GCCBuiltin<"__builtin_ia32_psrlv32hi_mask">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, 
+                         llvm_v32i16_ty, llvm_v32i16_ty,  llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv4_di : GCCBuiltin<"__builtin_ia32_psrlv4di_mask">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, 
+                         llvm_v4i64_ty, llvm_v4i64_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv4_si : GCCBuiltin<"__builtin_ia32_psrlv4si_mask">,
+              Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, 
+                         llvm_v4i32_ty, llvm_v4i32_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv8_hi : GCCBuiltin<"__builtin_ia32_psrlv8hi_mask">,
+              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, 
+                         llvm_v8i16_ty, llvm_v8i16_ty,  llvm_i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_mask_psrlv8_si : GCCBuiltin<"__builtin_ia32_psrlv8si_mask">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, 
+                         llvm_v8i32_ty, llvm_v8i32_ty,  llvm_i8_ty], [IntrNoMem]>;
 }

 // Gather ops
@ -3919,9 +4067,9 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 // Support protection key
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_rdpkru : GCCBuiltin <"__builtin_ia32_rdpkru">,
-              Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+              Intrinsic<[llvm_i32_ty], [], []>;
  def int_x86_wrpkru : GCCBuiltin<"__builtin_ia32_wrpkru">,
-              Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+              Intrinsic<[], [llvm_i32_ty], []>;
 }
 //===----------------------------------------------------------------------===//
 // Half float conversion
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@ -283,14 +283,20 @@ private:
  LLVMContext &Context;
  uint64_t NextIndex;
  SmallDenseMap<void *, std::pair<OwnerTy, uint64_t>, 4> UseMap;
+  /// Flag that can be set to false if this metadata should not be
+  /// RAUW'ed, e.g. if it is used as the key of a map.
+  bool CanReplace;

 public:
  ReplaceableMetadataImpl(LLVMContext &Context)
-      : Context(Context), NextIndex(0) {}
+      : Context(Context), NextIndex(0), CanReplace(true) {}
  ~ReplaceableMetadataImpl() {
    assert(UseMap.empty() && "Cannot destroy in-use replaceable metadata");
  }

+  /// Set the CanReplace flag to the given value.
+  void setCanReplace(bool Replaceable) { CanReplace = Replaceable; }
+
  LLVMContext &getContext() const { return Context; }

  /// \brief Replace all uses of this with MD.
@ -901,14 +907,19 @@ public:
    Context.getReplaceableUses()->replaceAllUsesWith(MD);
  }

+  /// Set the CanReplace flag to the given value.
+  void setCanReplace(bool Replaceable) {
+    Context.getReplaceableUses()->setCanReplace(Replaceable);
+  }
+
  /// \brief Resolve cycles.
  ///
  /// Once all forward declarations have been resolved, force cycles to be
-  /// resolved. If \p MDMaterialized is true, then any temporary metadata
+  /// resolved. If \p AllowTemps is true, then any temporary metadata
  /// is ignored, otherwise it asserts when encountering temporary metadata.
  ///
  /// \pre No operands (or operands' operands, etc.) have \a isTemporary().
-  void resolveCycles(bool MDMaterialized = true);
+  void resolveCycles(bool AllowTemps = false);

  /// \brief Replace a temporary node with a permanent one.
  ///
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@ -22,6 +22,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Compiler.h"

@ -36,14 +37,13 @@ enum class StatepointFlags {
  MaskAll = GCTransition ///< A bitmask that includes all valid flags.
 };

-class GCRelocateOperands;
+class GCRelocateInst;
 class ImmutableStatepoint;

 bool isStatepoint(const ImmutableCallSite &CS);
 bool isStatepoint(const Value *V);
 bool isStatepoint(const Value &V);

-bool isGCRelocate(const Value *V);
 bool isGCRelocate(const ImmutableCallSite &CS);

 bool isGCResult(const Value *V);
@ -247,7 +247,7 @@ public:
  /// May contain several relocations for the same base/derived pair.
  /// For example this could happen due to relocations on unwinding
  /// path of invoke.
-  std::vector<GCRelocateOperands> getRelocates() const;
+  std::vector<const GCRelocateInst *> getRelocates() const;

  /// Get the experimental_gc_result call tied to this statepoint.  Can be
  /// nullptr if there isn't a gc_result tied to this statepoint.  Guaranteed to
@ -305,33 +305,27 @@ public:
  explicit Statepoint(CallSite CS) : Base(CS) {}
 };

-/// Wraps a call to a gc.relocate and provides access to it's operands.
-/// TODO: This should likely be refactored to resememble the wrappers in
-/// InstrinsicInst.h.
-class GCRelocateOperands {
-  ImmutableCallSite RelocateCS;
-
+/// This represents the gc.relocate intrinsic.
+class GCRelocateInst : public IntrinsicInst {
 public:
-  GCRelocateOperands(const User *U) : RelocateCS(U) { assert(isGCRelocate(U)); }
-  GCRelocateOperands(const Instruction *inst) : RelocateCS(inst) {
-    assert(isGCRelocate(inst));
+  static inline bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
+  }
+  static inline bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
  }
-  GCRelocateOperands(CallSite CS) : RelocateCS(CS) { assert(isGCRelocate(CS)); }

  /// Return true if this relocate is tied to the invoke statepoint.
  /// This includes relocates which are on the unwinding path.
  bool isTiedToInvoke() const {
-    const Value *Token = RelocateCS.getArgument(0);
+    const Value *Token = getArgOperand(0);

    return isa<LandingPadInst>(Token) || isa<InvokeInst>(Token);
  }

-  /// Get enclosed relocate intrinsic
-  ImmutableCallSite getUnderlyingCallSite() { return RelocateCS; }
-
  /// The statepoint with which this gc.relocate is associated.
-  const Instruction *getStatepoint() {
-    const Value *Token = RelocateCS.getArgument(0);
+  const Instruction *getStatepoint() const {
+    const Value *Token = getArgOperand(0);

    // This takes care both of relocates for call statepoints and relocates
    // on normal path of invoke statepoint.
@ -354,22 +348,22 @@ public:
  /// The index into the associate statepoint's argument list
  /// which contains the base pointer of the pointer whose
  /// relocation this gc.relocate describes.
-  unsigned getBasePtrIndex() {
-    return cast<ConstantInt>(RelocateCS.getArgument(1))->getZExtValue();
+  unsigned getBasePtrIndex() const {
+    return cast<ConstantInt>(getArgOperand(1))->getZExtValue();
  }

  /// The index into the associate statepoint's argument list which
  /// contains the pointer whose relocation this gc.relocate describes.
-  unsigned getDerivedPtrIndex() {
-    return cast<ConstantInt>(RelocateCS.getArgument(2))->getZExtValue();
+  unsigned getDerivedPtrIndex() const {
+    return cast<ConstantInt>(getArgOperand(2))->getZExtValue();
  }

-  Value *getBasePtr() {
+  Value *getBasePtr() const {
    ImmutableCallSite CS(getStatepoint());
    return *(CS.arg_begin() + getBasePtrIndex());
  }

-  Value *getDerivedPtr() {
+  Value *getDerivedPtr() const {
    ImmutableCallSite CS(getStatepoint());
    return *(CS.arg_begin() + getDerivedPtrIndex());
  }
@ -377,11 +371,11 @@ public:

 template <typename FunTy, typename InstructionTy, typename ValueTy,
          typename CallSiteTy>
-std::vector<GCRelocateOperands>
+std::vector<const GCRelocateInst *>
 StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()
    const {

-  std::vector<GCRelocateOperands> Result;
+  std::vector<const GCRelocateInst *> Result;

  CallSiteTy StatepointCS = getCallSite();

@ -389,8 +383,8 @@ StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()
  // gc_relocates ensures that we only get pairs which are actually relocated
  // and used after the statepoint.
  for (const User *U : getInstruction()->users())
-    if (isGCRelocate(U))
-      Result.push_back(GCRelocateOperands(U));
+    if (auto *Relocate = dyn_cast<GCRelocateInst>(U))
+      Result.push_back(Relocate);

  if (!StatepointCS.isInvoke())
    return Result;
@ -401,8 +395,8 @@ StatepointBase<FunTy, InstructionTy, ValueTy, CallSiteTy>::getRelocates()

  // Search for gc relocates that are attached to this landingpad.
  for (const User *LandingPadUser : LandingPad->users()) {
-    if (isGCRelocate(LandingPadUser))
-      Result.push_back(GCRelocateOperands(LandingPadUser));
+    if (auto *Relocate = dyn_cast<GCRelocateInst>(LandingPadUser))
+      Result.push_back(Relocate);
  }
  return Result;
 }
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@ -39,8 +39,8 @@ public:
  FeatureBitset(const bitset<MAX_SUBTARGET_FEATURES>& B) : bitset(B) {}

  FeatureBitset(std::initializer_list<unsigned> Init) : bitset() {
-    for (auto I = Init.begin() , E = Init.end(); I != E; ++I)
-      set(*I);
+    for (auto I : Init)
+      set(I);
  }
 };

@ -59,6 +59,11 @@ struct SubtargetFeatureKV {
  bool operator<(StringRef S) const {
    return StringRef(Key) < S;
  }
+
+  // Compare routine for std::is_sorted.
+  bool operator<(const SubtargetFeatureKV &Other) const {
+    return StringRef(Key) < StringRef(Other.Key);
+  }
 };

 //===----------------------------------------------------------------------===//
@ -98,14 +103,13 @@ public:
  /// Adding Features.
  void AddFeature(StringRef String, bool Enable = true);

-  /// ToggleFeature - Toggle a feature and returns the newly updated feature
-  /// bits.
-  FeatureBitset ToggleFeature(FeatureBitset Bits, StringRef String,
-                         ArrayRef<SubtargetFeatureKV> FeatureTable);
+  /// ToggleFeature - Toggle a feature and update the feature bits.
+  static void ToggleFeature(FeatureBitset &Bits, StringRef String,
+                            ArrayRef<SubtargetFeatureKV> FeatureTable);

-  /// Apply the feature flag and return the newly updated feature bits.
-  FeatureBitset ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
-                                 ArrayRef<SubtargetFeatureKV> FeatureTable);
+  /// Apply the feature flag and update the feature bits.
+  static void ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
+                               ArrayRef<SubtargetFeatureKV> FeatureTable);

  /// Get feature bits of a CPU.
  FeatureBitset getFeatureBits(StringRef CPU,
--- a/include/llvm/ProfileData/InstrProf.h
+++ b/include/llvm/ProfileData/InstrProf.h
@ -155,11 +155,36 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName);
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                     GlobalValue::LinkageTypes Linkage,
                                     StringRef FuncName);
+/// Return the initializer in string of the PGO name var \c NameVar.
+StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar);

 /// Given a PGO function name, remove the filename prefix and return
 /// the original (static) function name.
 StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName);

+/// Given a vector of strings (function PGO names) \c NameStrs, the
+/// method generates a combined string \c Result thatis ready to be
+/// serialized.  The \c Result string is comprised of three fields:
+/// The first field is the legnth of the uncompressed strings, and the
+/// the second field is the length of the zlib-compressed string.
+/// Both fields are encoded in ULEB128.  If \c doCompress is false, the
+///  third field is the uncompressed strings; otherwise it is the 
+/// compressed string. When the string compression is off, the 
+/// second field will have value zero.
+int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+                              bool doCompression, std::string &Result);
+/// Produce \c Result string with the same format described above. The input
+/// is vector of PGO function name variables that are referenced.
+int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+                              std::string &Result);
+class InstrProfSymtab;
+/// \c NameStrings is a string composed of one of more sub-strings encoded in
+/// the
+/// format described above. The substrings are seperated by 0 or more zero
+/// bytes.
+/// This method decodes the string and populates the \c Symtab.
+int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab);
+
 const std::error_category &instrprof_category();

 enum class instrprof_error {
@ -235,6 +260,11 @@ public:
  /// This interface is used by reader of CoverageMapping test
  /// format.
  inline std::error_code create(StringRef D, uint64_t BaseAddr);
+  /// \c NameStrings is a string composed of one of more sub-strings
+  ///  encoded in the format described above. The substrings are
+  /// seperated by 0 or more zero bytes. This method decodes the
+  /// string and populates the \c Symtab.
+  inline std::error_code create(StringRef NameStrings);
  /// Create InstrProfSymtab from a set of names iteratable from
  /// \p IterRange. This interface is used by IndexedProfReader.
  template <typename NameIterRange> void create(const NameIterRange &IterRange);
@ -255,8 +285,8 @@ public:
    AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val));
  }
  AddrHashMap &getAddrHashMap() { return AddrToMD5Map; }
-  /// Return function's PGO name from the function name's symabol
-  /// address in the object file. If an error occurs, Return
+  /// Return function's PGO name from the function name's symbol
+  /// address in the object file. If an error occurs, return
  /// an empty string.
  StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize);
  /// Return function's PGO name from the name's md5 hash value.
@ -270,6 +300,12 @@ std::error_code InstrProfSymtab::create(StringRef D, uint64_t BaseAddr) {
  return std::error_code();
 }

+std::error_code InstrProfSymtab::create(StringRef NameStrings) {
+  if (readPGOFuncNameStrings(NameStrings, *this))
+    return make_error_code(instrprof_error::malformed);
+  return std::error_code();
+}
+
 template <typename NameIterRange>
 void InstrProfSymtab::create(const NameIterRange &IterRange) {
  for (auto Name : IterRange)
@ -576,8 +612,14 @@ template <class IntPtrT> struct CovMapFunctionRecord {
  #define COVMAP_FUNC_RECORD(Type, LLVMType, Name, Init) Type Name;
  #include "llvm/ProfileData/InstrProfData.inc"
 };
-LLVM_PACKED_END
+// Per module coverage mapping data header, i.e. CoverageMapFileHeader
+// documented above.
+struct CovMapHeader {
+#define COVMAP_HEADER(Type, LLVMType, Name, Init) Type Name;
+#include "llvm/ProfileData/InstrProfData.inc"
+};

+LLVM_PACKED_END
 }

 } // end namespace llvm
--- a/include/llvm/ProfileData/InstrProfData.inc
+++ b/include/llvm/ProfileData/InstrProfData.inc
@ -1,4 +1,4 @@
-/*===-- InstrProfData.inc - instr profiling runtime structures -----------=== *\
+/*===-- InstrProfData.inc - instr profiling runtime structures -*- C++ -*-=== *\
 |*
 |*                     The LLVM Compiler Infrastructure
 |*
@ -167,6 +167,25 @@ COVMAP_FUNC_RECORD(const uint64_t, llvm::Type::getInt64Ty(Ctx), FuncHash, \
 #undef COVMAP_FUNC_RECORD
 /* COVMAP_FUNC_RECORD end.  */

+/* COVMAP_HEADER start */
+/* Definition of member fields of coverage map header.
+ */
+#ifndef COVMAP_HEADER
+#define COVMAP_HEADER(Type, LLVMType, Name, Initializer)
+#else
+#define INSTR_PROF_DATA_DEFINED
+#endif
+COVMAP_HEADER(uint32_t, Int32Ty, NRecords, \
+              llvm::ConstantInt::get(Int32Ty,  FunctionRecords.size()))
+COVMAP_HEADER(uint32_t, Int32Ty, FilenamesSize, \
+              llvm::ConstantInt::get(Int32Ty, FilenamesSize))
+COVMAP_HEADER(uint32_t, Int32Ty, CoverageSize, \
+              llvm::ConstantInt::get(Int32Ty, CoverageMappingSize))
+COVMAP_HEADER(uint32_t, Int32Ty, Version, \
+              llvm::ConstantInt::get(Int32Ty, CoverageMappingVersion1))
+#undef COVMAP_HEADER
+/* COVMAP_HEADER end.  */
+

 #ifdef INSTR_PROF_VALUE_PROF_DATA
 #define INSTR_PROF_DATA_DEFINED
--- a/include/llvm/Support/ARMTargetParser.def
+++ b/include/llvm/Support/ARMTargetParser.def
@ -213,6 +213,7 @@ ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, AEK_CRC)
 ARM_CPU_NAME("cortex-a57", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
 ARM_CPU_NAME("cortex-a72", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
 ARM_CPU_NAME("cyclone", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
+ARM_CPU_NAME("exynos-m1", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, AEK_CRC)
 // Non-standard Arch names.
 ARM_CPU_NAME("iwmmxt", AK_IWMMXT, FK_NONE, true, AEK_NONE)
 ARM_CPU_NAME("xscale", AK_XSCALE, FK_NONE, true, AEK_NONE)
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@ -130,7 +130,7 @@ struct ProcessInfo {

  /// Return true if the given arguments fit within system-specific
  /// argument length limits.
-  bool argumentsFitWithinSystemLimits(ArrayRef<const char*> Args);
+  bool commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args);

  /// File encoding options when writing contents that a non-UTF8 tool will
  /// read (on Windows systems). For UNIX, we always use UTF-8.
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@ -305,7 +305,7 @@ private:
 /// increment() which must set CurrentEntry to 0 to create an end iterator.
 template <class BaseT, class ValueT>
 class basic_collection_iterator
-    : public std::iterator<std::forward_iterator_tag, ValueT> {
+    : public std::iterator<std::input_iterator_tag, ValueT> {
 public:
  basic_collection_iterator() : Base(nullptr) {}
  basic_collection_iterator(BaseT *B) : Base(B) {}
@ -326,11 +326,24 @@ public:
    return Base->CurrentEntry;
  }

+  /// Note on EqualityComparable:
+  ///
+  /// The iterator is not re-entrant,
+  /// it is meant to be used for parsing YAML on-demand
+  /// Once iteration started - it can point only to one entry at a time
+  /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
+  /// iff Base and Other.Base are equal.
+  bool operator==(const basic_collection_iterator &Other) const {
+    if (Base && (Base == Other.Base)) {
+      assert((Base->CurrentEntry == Other.Base->CurrentEntry)
+             && "Equal Bases expected to point to equal Entries");
+    }
+
+    return Base == Other.Base;
+  }
+
  bool operator!=(const basic_collection_iterator &Other) const {
-    if (Base != Other.Base)
-      return true;
-    return (Base && Other.Base) &&
-           Base->CurrentEntry != Other.Base->CurrentEntry;
+    return !(Base == Other.Base);
  }

  basic_collection_iterator &operator++() {
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@ -232,7 +232,7 @@ protected:
  /// We could pack these a bit tighter by not having the IK_FirstXXXInit
  /// and IK_LastXXXInit be their own values, but that would degrade
  /// readability for really no benefit.
-  enum InitKind {
+  enum InitKind : uint8_t {
    IK_BitInit,
    IK_FirstTypedInit,
    IK_BitsInit,
@ -256,6 +256,9 @@ protected:

 private:
  const InitKind Kind;
+protected:
+  uint8_t Opc; // Used by UnOpInit, BinOpInit, and TernOpInit
+private:
  Init(const Init &) = delete;
  Init &operator=(const Init &) = delete;
  virtual void anchor();
@ -264,7 +267,7 @@ public:
  InitKind getKind() const { return Kind; }

 protected:
-  explicit Init(InitKind K) : Kind(K) {}
+  explicit Init(InitKind K, uint8_t Opc = 0) : Kind(K), Opc(Opc) {}

 public:
  virtual ~Init() {}
@ -365,7 +368,8 @@ class TypedInit : public Init {
  TypedInit &operator=(const TypedInit &Other) = delete;

 protected:
-  explicit TypedInit(InitKind K, RecTy *T) : Init(K), Ty(T) {}
+  explicit TypedInit(InitKind K, RecTy *T, uint8_t Opc = 0)
+    : Init(K, Opc), Ty(T) {}
  ~TypedInit() override {
    // If this is a DefInit we need to delete the RecordRecTy.
    if (getKind() == IK_DefInit)
@ -650,7 +654,8 @@ class OpInit : public TypedInit {
  OpInit &operator=(OpInit &Other) = delete;

 protected:
-  explicit OpInit(InitKind K, RecTy *Type) : TypedInit(K, Type) {}
+  explicit OpInit(InitKind K, RecTy *Type, uint8_t Opc)
+    : TypedInit(K, Type, Opc) {}

 public:
  static bool classof(const Init *I) {
@ -677,14 +682,13 @@ public:
 ///
 class UnOpInit : public OpInit {
 public:
-  enum UnaryOp { CAST, HEAD, TAIL, EMPTY };
+  enum UnaryOp : uint8_t { CAST, HEAD, TAIL, EMPTY };

 private:
-  UnaryOp Opc;
  Init *LHS;

  UnOpInit(UnaryOp opc, Init *lhs, RecTy *Type)
-    : OpInit(IK_UnOpInit, Type), Opc(opc), LHS(lhs) {}
+    : OpInit(IK_UnOpInit, Type, opc), LHS(lhs) {}

  UnOpInit(const UnOpInit &Other) = delete;
  UnOpInit &operator=(const UnOpInit &Other) = delete;
@ -708,7 +712,7 @@ public:
    return getOperand();
  }

-  UnaryOp getOpcode() const { return Opc; }
+  UnaryOp getOpcode() const { return (UnaryOp)Opc; }
  Init *getOperand() const { return LHS; }

  // Fold - If possible, fold this to a simpler init.  Return this if not
@ -724,14 +728,14 @@ public:
 ///
 class BinOpInit : public OpInit {
 public:
-  enum BinaryOp { ADD, AND, SHL, SRA, SRL, LISTCONCAT, STRCONCAT, CONCAT, EQ };
+  enum BinaryOp : uint8_t { ADD, AND, SHL, SRA, SRL, LISTCONCAT,
+                            STRCONCAT, CONCAT, EQ };

 private:
-  BinaryOp Opc;
  Init *LHS, *RHS;

  BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, RecTy *Type) :
-      OpInit(IK_BinOpInit, Type), Opc(opc), LHS(lhs), RHS(rhs) {}
+      OpInit(IK_BinOpInit, Type, opc), LHS(lhs), RHS(rhs) {}

  BinOpInit(const BinOpInit &Other) = delete;
  BinOpInit &operator=(const BinOpInit &Other) = delete;
@ -759,7 +763,7 @@ public:
    }
  }

-  BinaryOp getOpcode() const { return Opc; }
+  BinaryOp getOpcode() const { return (BinaryOp)Opc; }
  Init *getLHS() const { return LHS; }
  Init *getRHS() const { return RHS; }

@ -776,15 +780,14 @@ public:
 ///
 class TernOpInit : public OpInit {
 public:
-  enum TernaryOp { SUBST, FOREACH, IF };
+  enum TernaryOp : uint8_t { SUBST, FOREACH, IF };

 private:
-  TernaryOp Opc;
  Init *LHS, *MHS, *RHS;

  TernOpInit(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs,
             RecTy *Type) :
-      OpInit(IK_TernOpInit, Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
+      OpInit(IK_TernOpInit, Type, opc), LHS(lhs), MHS(mhs), RHS(rhs) {}

  TernOpInit(const TernOpInit &Other) = delete;
  TernOpInit &operator=(const TernOpInit &Other) = delete;
@ -815,7 +818,7 @@ public:
    }
  }

-  TernaryOp getOpcode() const { return Opc; }
+  TernaryOp getOpcode() const { return (TernaryOp)Opc; }
  Init *getLHS() const { return LHS; }
  Init *getMHS() const { return MHS; }
  Init *getRHS() const { return RHS; }
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@ -936,6 +936,10 @@ class AsmParser {
  // ShouldEmitMatchRegisterName - Set to false if the target needs a hand
  // written register name matcher
  bit ShouldEmitMatchRegisterName = 1;
+
+  // HasMnemonicFirst - Set to false if target instructions don't always
+  // start with a mnemonic as the first token.
+  bit HasMnemonicFirst = 1;
 }
 def DefaultAsmParser : AsmParser;

--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -2269,6 +2269,12 @@ public:
    return false;
  }

+  /// Return true if the MachineFunction contains a COPY which would imply
+  /// HasOpaqueSPAdjustment.
+  virtual bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const {
+    return false;
+  }
+
  /// Perform necessary initialization to handle a subset of CSRs explicitly
  /// via copies. This function is called at the beginning of instruction
  /// selection.
--- a/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@ -23,11 +23,13 @@

 namespace llvm {

-/// This optimization identifies DIV instructions that can be
+/// This optimization identifies DIV instructions in a BB that can be
 /// profitably bypassed and carried out with a shorter, faster divide.
-bool bypassSlowDivision(Function &F,
-                        Function::iterator &I,
-                        const DenseMap<unsigned int, unsigned int> &BypassWidth);
+///
+/// This optimization may add basic blocks immediately after BB; for obvious
+/// reasons, you shouldn't pass those blocks to bypassSlowDivision.
+bool bypassSlowDivision(
+    BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidth);

 } // End llvm namespace

--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@ -16,6 +16,7 @@

 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"

@ -39,6 +40,8 @@ struct LICMSafetyInfo {
  bool MayThrow;           // The current loop contains an instruction which
                           // may throw.
  bool HeaderMayThrow;     // Same as previous, but specific to loop header
+  // Used to update funclet bundle operands.
+  DenseMap<BasicBlock *, ColorVector> BlockColors;
  LICMSafetyInfo() : MayThrow(false), HeaderMayThrow(false)
  {}
 };
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@ -543,7 +543,6 @@ static bool isMemsetPattern16(const Function *MS,
        isa<IntegerType>(MemsetType->getParamType(2)))
      return true;
  }
-
  return false;
 }

@ -583,9 +582,6 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
  if (F->onlyAccessesArgMemory())
    Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);

-  if (isMemsetPattern16(F, TLI))
-    Min = FMRB_OnlyAccessesArgumentPointees;
-
  // Otherwise be conservative.
  return FunctionModRefBehavior(AAResultBase::getModRefBehavior(F) & Min);
 }
@ -599,22 +595,21 @@ ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
-      assert((ArgIdx == 0 || ArgIdx == 1) &&
-             "Invalid argument index for memory intrinsic");
-      return ArgIdx ? MRI_Ref : MRI_Mod;
+      // We don't currently have a writeonly attribute.  All other properties
+      // of these intrinsics are nicely described via attributes in
+      // Intrinsics.td and handled generically below.
+      if (ArgIdx == 0)
+        return MRI_Mod;
    }

  // We can bound the aliasing properties of memset_pattern16 just as we can
  // for memcpy/memset.  This is particularly important because the
  // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
-  // whenever possible.
-  if (CS.getCalledFunction() &&
-      isMemsetPattern16(CS.getCalledFunction(), TLI)) {
-    assert((ArgIdx == 0 || ArgIdx == 1) &&
-           "Invalid argument index for memset_pattern16");
-    return ArgIdx ? MRI_Ref : MRI_Mod;
-  }
-  // FIXME: Handle memset_pattern4 and memset_pattern8 also.
+  // whenever possible.  Note that all but the missing writeonly attribute are
+  // handled via InferFunctionAttr.
+  if (CS.getCalledFunction() && isMemsetPattern16(CS.getCalledFunction(), TLI))
+    if (ArgIdx == 0)
+      return MRI_Mod;

  if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly))
    return MRI_Ref;
--- a/lib/Analysis/GlobalsModRef.cpp
+++ b/lib/Analysis/GlobalsModRef.cpp
@ -376,15 +376,6 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
        } else {
          return true; // Argument of an unknown call.
        }
-        // If the Callee is not ReadNone, it may read the global,
-        // and if it is not ReadOnly, it may also write to it.
-        Function *CalleeF = CS.getCalledFunction();
-        if (!CalleeF->doesNotAccessMemory()) {
-          if (Readers)
-            Readers->insert(CalleeF);
-          if (Writers && !CalleeF->onlyReadsMemory())
-            Writers->insert(CalleeF);
-        }
      }
    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
@ -516,7 +507,7 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {

      if (F->isDeclaration()) {
        // Try to get mod/ref behaviour from function attributes.
-        if (F->doesNotAccessMemory() || F->onlyAccessesInaccessibleMemory()) {
+        if (F->doesNotAccessMemory()) {
          // Can't do better than that!
        } else if (F->onlyReadsMemory()) {
          FI.addModRefInfo(MRI_Ref);
@ -524,12 +515,6 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
            // This function might call back into the module and read a global -
            // consider every global as possibly being read by this function.
            FI.setMayReadAnyGlobal();
-        } else if (F->onlyAccessesArgMemory() || 
-                   F->onlyAccessesInaccessibleMemOrArgMem()) {
-          // This function may only access (read/write) memory pointed to by its
-          // arguments. If this pointer is to a global, this escaping use of the
-          // pointer is captured in AnalyzeUsesOfPointer().
-          FI.addModRefInfo(MRI_ModRef);
        } else {
          FI.addModRefInfo(MRI_ModRef);
          // Can't say anything useful unless it's an intrinsic - they don't
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@ -187,13 +187,6 @@ bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
  return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
 }

-/// \brief Tests if a value is a call or invoke to a library function that
-/// allocates memory and never returns null (such as operator new).
-bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
-                               bool LookThroughBitCast) {
-  return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast);
-}
-
 /// extractMallocCall - Returns the corresponding CallInst if the instruction
 /// is a malloc call.  Since CallInst::CreateMalloc() only creates calls, we
 /// ignore InvokeInst here.
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@ -477,7 +477,7 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
  // being 42. A key property of this program however is that if either
  // 1 or 4 were missing, there would be a race between the store of 42
  // either the store of 0 or the load (making the whole progam racy).
-  // The paper mentionned above shows that the same property is respected
+  // The paper mentioned above shows that the same property is respected
  // by every program that can detect any optimisation of that kind: either
  // it is racy (undefined) or there is a release followed by an acquire
  // between the pair of accesses under consideration.
@ -685,13 +685,13 @@ MemDepResult MemoryDependenceAnalysis::getSimplePointerDependencyFrom(
        return MemDepResult::getDef(Inst);
      if (isInvariantLoad)
        continue;
-      // Be conservative if the accessed pointer may alias the allocation.
-      if (AA->alias(Inst, AccessPtr) != NoAlias)
-        return MemDepResult::getClobber(Inst);
-      // If the allocation is not aliased and does not read memory (like
-      // strdup), it is safe to ignore.
-      if (isa<AllocaInst>(Inst) ||
-          isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI))
+      // Be conservative if the accessed pointer may alias the allocation -
+      // fallback to the generic handling below.
+      if ((AA->alias(Inst, AccessPtr) == NoAlias) &&
+          // If the allocation is not aliased and does not read memory (like
+          // strdup), it is safe to ignore.
+          (isa<AllocaInst>(Inst) || isMallocLikeFn(Inst, TLI) ||
+           isCallocLikeFn(Inst, TLI)))
        continue;
    }

@ -792,10 +792,8 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
 static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
                         int Count = -1) {
  if (Count == -1) Count = Cache.size();
-  if (Count == 0) return;
-
-  for (unsigned i = 1; i != unsigned(Count); ++i)
-    assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!");
+  assert(std::is_sorted(Cache.begin(), Cache.begin() + Count) &&
+         "Cache isn't sorted!");
 }
 #endif

--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@ -52,14 +52,13 @@ static bool hasSinCosPiStret(const Triple &T) {
 /// specified target triple.  This should be carefully written so that a missing
 /// target triple gets a sane set of defaults.
 static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
-                       const char *const *StandardNames) {
-#ifndef NDEBUG
+                       ArrayRef<const char *> StandardNames) {
  // Verify that the StandardNames array is in alphabetical order.
-  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
-    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
-      llvm_unreachable("TargetLibraryInfoImpl function names must be sorted");
-  }
-#endif // !NDEBUG
+  assert(std::is_sorted(StandardNames.begin(), StandardNames.end(),
+                        [](const char *LHS, const char *RHS) {
+                          return strcmp(LHS, RHS) < 0;
+                        }) &&
+         "TargetLibraryInfoImpl function names must be sorted");

  if (T.getArch() == Triple::r600 ||
      T.getArch() == Triple::amdgcn) {
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@ -1743,9 +1743,10 @@ bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth,
    return false;

  Value *X = nullptr, *Y = nullptr;
-  // A shift of a power of two is a power of two or zero.
+  // A shift left or a logical shift right of a power of two is a power of two
+  // or zero.
  if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
-                 match(V, m_Shr(m_Value(X), m_Value()))))
+                 match(V, m_LShr(m_Value(X), m_Value()))))
    return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q, DL);

  if (ZExtInst *ZI = dyn_cast<ZExtInst>(V))
@ -2829,7 +2830,12 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
                                              const DataLayout &DL) {
  unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType());
  APInt ByteOffset(BitWidth, 0);
-  while (1) {
+
+  // We walk up the defs but use a visited set to handle unreachable code. In
+  // that case, we stop after accumulating the cycle once (not that it
+  // matters).
+  SmallPtrSet<Value *, 16> Visited;
+  while (Visited.insert(Ptr).second) {
    if (Ptr->getType()->isVectorTy())
      break;

@ -3268,12 +3274,9 @@ static bool isDereferenceableAndAlignedPointer(
  }

  // For gc.relocate, look through relocations
-  if (const IntrinsicInst *I = dyn_cast<IntrinsicInst>(V))
-    if (I->getIntrinsicID() == Intrinsic::experimental_gc_relocate) {
-      GCRelocateOperands RelocateInst(I);
-      return isDereferenceableAndAlignedPointer(
-          RelocateInst.getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited);
-    }
+  if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V))
+    return isDereferenceableAndAlignedPointer(
+        RelocateInst->getDerivedPtr(), Align, DL, CtxI, DT, TLI, Visited);

  if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
    return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Align, DL,
@ -3474,10 +3477,6 @@ bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
    if (CS.isReturnNonNull())
      return true;

-  // operator new never returns null.
-  if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
-    return true;
-
  return false;
 }

--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@ -3071,7 +3071,12 @@ void BitcodeReader::saveMetadataList(
  for (unsigned ID = 0; ID < MetadataList.size(); ++ID) {
    Metadata *MD = MetadataList[ID];
    auto *N = dyn_cast_or_null<MDNode>(MD);
+    assert((!N || (N->isResolved() || N->isTemporary())) &&
+           "Found non-resolved non-temp MDNode while saving metadata");
    // Save all values if !OnlyTempMD, otherwise just the temporary metadata.
+    // Note that in the !OnlyTempMD case we need to save all Metadata, not
+    // just MDNode, as we may have references to other types of module-level
+    // metadata (e.g. ValueAsMetadata) from instructions.
    if (!OnlyTempMD || (N && N->isTemporary())) {
      // Will call this after materializing each function, in order to
      // handle remapping of the function's instructions/metadata.
@ -3080,6 +3085,11 @@ void BitcodeReader::saveMetadataList(
        assert(MetadataToIDs[MD] == ID && "Inconsistent metadata value id");
        continue;
      }
+      if (N && N->isTemporary())
+        // Ensure that we assert if someone tries to RAUW this temporary
+        // metadata while it is the key of a map. The flag will be set back
+        // to true when the saved metadata list is destroyed.
+        N->setCanReplace(false);
      MetadataToIDs[MD] = ID;
    }
  }
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@ -976,32 +976,32 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
  }
 }

-static int getRank(const WinEHFuncInfo &FuncInfo, int State) {
+static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) {
  int Rank = 0;
  while (State != -1) {
    ++Rank;
-    State = FuncInfo.ClrEHUnwindMap[State].Parent;
+    State = FuncInfo.ClrEHUnwindMap[State].TryParentState;
  }
  return Rank;
 }

-static int getAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
-  int LeftRank = getRank(FuncInfo, Left);
-  int RightRank = getRank(FuncInfo, Right);
+static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
+  int LeftRank = getTryRank(FuncInfo, Left);
+  int RightRank = getTryRank(FuncInfo, Right);

  while (LeftRank < RightRank) {
-    Right = FuncInfo.ClrEHUnwindMap[Right].Parent;
+    Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
    --RightRank;
  }

  while (RightRank < LeftRank) {
-    Left = FuncInfo.ClrEHUnwindMap[Left].Parent;
+    Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
    --LeftRank;
  }

  while (Left != Right) {
-    Left = FuncInfo.ClrEHUnwindMap[Left].Parent;
-    Right = FuncInfo.ClrEHUnwindMap[Right].Parent;
+    Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+    Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
  }

  return Left;
@ -1035,9 +1035,9 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
        FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>();
    HandlerStates[HandlerBlock] = State;
    // Use this loop through all handlers to verify our assumption (used in
-    // the MinEnclosingState computation) that ancestors have lower state
-    // numbers than their descendants.
-    assert(FuncInfo.ClrEHUnwindMap[State].Parent < State &&
+    // the MinEnclosingState computation) that enclosing funclets have lower
+    // state numbers than their enclosed funclets.
+    assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State &&
           "ill-formed state numbering");
  }
  // Map the main function to the NullState.
@ -1070,7 +1070,6 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
  SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates);

  // Visit the root function and each funclet.
-
  for (MachineFunction::const_iterator FuncletStart = MF->begin(),
                                       FuncletEnd = MF->begin(),
                                       End = MF->end();
@ -1100,17 +1099,18 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
    for (const auto &StateChange :
         InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) {
      // Close any try regions we're not still under
-      int AncestorState =
-          getAncestor(FuncInfo, CurrentState, StateChange.NewState);
-      while (CurrentState != AncestorState) {
-        assert(CurrentState != NullState && "Failed to find ancestor!");
+      int StillPendingState =
+          getTryAncestor(FuncInfo, CurrentState, StateChange.NewState);
+      while (CurrentState != StillPendingState) {
+        assert(CurrentState != NullState &&
+               "Failed to find still-pending state!");
        // Close the pending clause
        Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel,
                           CurrentState, FuncletState});
-        // Now the parent handler is current
-        CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].Parent;
+        // Now the next-outer try region is current
+        CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState;
        // Pop the new start label from the handler stack if we've exited all
-        // descendants of the corresponding handler.
+        // inner try regions of the corresponding try region.
        if (HandlerStack.back().second == CurrentState)
          CurrentStartLabel = HandlerStack.pop_back_val().first;
      }
@ -1121,7 +1121,8 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
        // it.
        for (int EnteredState = StateChange.NewState;
             EnteredState != CurrentState;
-             EnteredState = FuncInfo.ClrEHUnwindMap[EnteredState].Parent) {
+             EnteredState =
+                 FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) {
          int &MinEnclosingState = MinClauseMap[EnteredState];
          if (FuncletState < MinEnclosingState)
            MinEnclosingState = FuncletState;
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@ -225,8 +225,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
  if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
    const DenseMap<unsigned int, unsigned int> &BypassWidths =
       TLI->getBypassSlowDivWidths();
-    for (Function::iterator I = F.begin(); I != F.end(); I++)
-      EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
+    BasicBlock* BB = &*F.begin();
+    while (BB != nullptr) {
+      // bypassSlowDivision may create new BBs, but we don't want to reapply the
+      // optimization to those blocks.
+      BasicBlock* Next = BB->getNextNode();
+      EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+      BB = Next;
+    }
  }

  // Eliminate blocks that contain only PHI nodes and an
@ -526,19 +532,17 @@ void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
 // Computes a map of base pointer relocation instructions to corresponding
 // derived pointer relocation instructions given a vector of all relocate calls
 static void computeBaseDerivedRelocateMap(
-    const SmallVectorImpl<User *> &AllRelocateCalls,
-    DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
-        RelocateInstMap) {
+    const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
+    DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
+        &RelocateInstMap) {
  // Collect information in two maps: one primarily for locating the base object
  // while filling the second map; the second map is the final structure holding
  // a mapping between Base and corresponding Derived relocate calls
-  DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
-  for (auto &U : AllRelocateCalls) {
-    GCRelocateOperands ThisRelocate(U);
-    IntrinsicInst *I = cast<IntrinsicInst>(U);
-    auto K = std::make_pair(ThisRelocate.getBasePtrIndex(),
-                            ThisRelocate.getDerivedPtrIndex());
-    RelocateIdxMap.insert(std::make_pair(K, I));
+  DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
+  for (auto *ThisRelocate : AllRelocateCalls) {
+    auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
+                            ThisRelocate->getDerivedPtrIndex());
+    RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
  }
  for (auto &Item : RelocateIdxMap) {
    std::pair<unsigned, unsigned> Key = Item.first;
@ -546,7 +550,7 @@ static void computeBaseDerivedRelocateMap(
      // Base relocation: nothing to insert
      continue;

-    IntrinsicInst *I = Item.second;
+    GCRelocateInst *I = Item.second;
    auto BaseKey = std::make_pair(Key.first, Key.first);

    // We're iterating over RelocateIdxMap so we cannot modify it.
@ -579,16 +583,13 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
 // Takes a RelocatedBase (base pointer relocation instruction) and Targets to
 // replace, computes a replacement, and affects it.
 static bool
-simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
-                          const SmallVectorImpl<IntrinsicInst *> &Targets) {
+simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
+                          const SmallVectorImpl<GCRelocateInst *> &Targets) {
  bool MadeChange = false;
-  for (auto &ToReplace : Targets) {
-    GCRelocateOperands MasterRelocate(RelocatedBase);
-    GCRelocateOperands ThisRelocate(ToReplace);
-
-    assert(ThisRelocate.getBasePtrIndex() == MasterRelocate.getBasePtrIndex() &&
+  for (GCRelocateInst *ToReplace : Targets) {
+    assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
           "Not relocating a derived object of the original base object");
-    if (ThisRelocate.getBasePtrIndex() == ThisRelocate.getDerivedPtrIndex()) {
+    if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
      // A duplicate relocate call. TODO: coalesce duplicates.
      continue;
    }
@ -601,8 +602,8 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
      continue;
    }

-    Value *Base = ThisRelocate.getBasePtr();
-    auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.getDerivedPtr());
+    Value *Base = ToReplace->getBasePtr();
+    auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
    if (!Derived || Derived->getPointerOperand() != Base)
      continue;

@ -680,12 +681,12 @@ simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
 // %val = load %ptr'
 bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
  bool MadeChange = false;
-  SmallVector<User *, 2> AllRelocateCalls;
+  SmallVector<GCRelocateInst *, 2> AllRelocateCalls;

  for (auto *U : I.users())
-    if (isGCRelocate(dyn_cast<Instruction>(U)))
+    if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
      // Collect all the relocate calls associated with a statepoint
-      AllRelocateCalls.push_back(U);
+      AllRelocateCalls.push_back(Relocate);

  // We need atleast one base pointer relocation + one derived pointer
  // relocation to mangle
@ -694,7 +695,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {

  // RelocateInstMap is a mapping from the base relocate instruction to the
  // corresponding derived relocate instructions
-  DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
+  DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
  if (RelocateInstMap.empty())
    return false;
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@ -122,8 +122,7 @@ INITIALIZE_PASS_END(MachineCSE, "machine-cse",
 bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
                                               MachineBasicBlock *MBB) {
  bool Changed = false;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = MI->getOperand(i);
+  for (MachineOperand &MO : MI->operands()) {
    if (!MO.isReg() || !MO.isUse())
      continue;
    unsigned Reg = MO.getReg();
@ -186,8 +185,7 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
      return true;

    bool SeenDef = false;
-    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = I->getOperand(i);
+    for (const MachineOperand &MO : I->operands()) {
      if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
        SeenDef = true;
      if (!MO.isReg() || !MO.getReg())
@ -220,8 +218,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
                                       SmallVectorImpl<unsigned> &PhysDefs,
                                       bool &PhysUseDef) const{
  // First, add all uses to PhysRefs.
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
    if (!MO.isReg() || MO.isDef())
      continue;
    unsigned Reg = MO.getReg();
@ -239,8 +236,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
  // (which currently contains only uses), set the PhysUseDef flag.
  PhysUseDef = false;
  MachineBasicBlock::const_iterator I = MI; I = std::next(I);
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
    if (!MO.isReg() || !MO.isDef())
      continue;
    unsigned Reg = MO.getReg();
@ -311,8 +307,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
    if (I == E)
      return true;

-    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = I->getOperand(i);
+    for (const MachineOperand &MO : I->operands()) {
      // RegMasks go on instructions like calls that clobber lots of physregs.
      // Don't attempt to CSE across such an instruction.
      if (MO.isRegMask())
@ -398,8 +393,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
  // Heuristics #2: If the expression doesn't not use a vr and the only use
  // of the redundant computation are copies, do not cse.
  bool HasVRegUse = false;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isReg() && MO.isUse() &&
        TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
      HasVRegUse = true;
@ -580,9 +574,9 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {

    // Actually perform the elimination.
    if (DoCSE) {
-      for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
-        unsigned OldReg = CSEPairs[i].first;
-        unsigned NewReg = CSEPairs[i].second;
+      for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
+        unsigned OldReg = CSEPair.first;
+        unsigned NewReg = CSEPair.second;
        // OldReg may have been unused but is used now, clear the Dead flag
        MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
        assert(Def != nullptr && "CSEd register has no unique definition?");
@ -594,8 +588,8 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {

      // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
      // we should make sure it is not dead at CSMI.
-      for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
-        CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
+      for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
+        CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);

      // Go through implicit defs of CSMI and MI, and clear the kill flags on
      // their uses in all the instructions between CSMI and MI.
@ -685,18 +679,14 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
    Node = WorkList.pop_back_val();
    Scopes.push_back(Node);
    const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
-    unsigned NumChildren = Children.size();
-    OpenChildren[Node] = NumChildren;
-    for (unsigned i = 0; i != NumChildren; ++i) {
-      MachineDomTreeNode *Child = Children[i];
+    OpenChildren[Node] = Children.size();
+    for (MachineDomTreeNode *Child : Children)
      WorkList.push_back(Child);
-    }
  } while (!WorkList.empty());

  // Now perform CSE.
  bool Changed = false;
-  for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
-    MachineDomTreeNode *Node = Scopes[i];
+  for (MachineDomTreeNode *Node : Scopes) {
    MachineBasicBlock *MBB = Node->getBlock();
    EnterScope(MBB);
    Changed |= ProcessBlock(MBB);
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@ -866,6 +866,27 @@ void MachineInstr::addMemOperand(MachineFunction &MF,
  setMemRefs(NewMemRefs, NewMemRefs + NewNum);
 }

+std::pair<MachineInstr::mmo_iterator, unsigned>
+MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
+  // TODO: If we end up with too many memory operands, return the empty
+  // conservative set rather than failing asserts.
+  // TODO: consider uniquing elements within the operand lists to reduce
+  // space usage and fall back to conservative information less often.
+  size_t CombinedNumMemRefs = (memoperands_end() - memoperands_begin())
+    + (Other.memoperands_end() - Other.memoperands_begin());
+
+  MachineFunction *MF = getParent()->getParent();
+  mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs);
+  mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(),
+                                  MemBegin);
+  MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(),
+                     MemEnd);
+  assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs &&
+         "missing memrefs");
+  
+  return std::make_pair(MemBegin, CombinedNumMemRefs);
+}
+
 bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
  assert(!isBundledWithPred() && "Must be called on bundle header");
  for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) {
@ -1738,7 +1759,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
  bool HaveSemi = false;
  const unsigned PrintableFlags = FrameSetup | FrameDestroy;
  if (Flags & PrintableFlags) {
-    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    if (!HaveSemi) {
+      OS << ";";
+      HaveSemi = true;
+    }
    OS << " flags: ";

    if (Flags & FrameSetup)
@ -1749,7 +1773,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
  }

  if (!memoperands_empty()) {
-    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    if (!HaveSemi) {
+      OS << ";";
+      HaveSemi = true;
+    }

    OS << " mem:";
    for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
@ -1762,7 +1789,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,

  // Print the regclass of any virtual registers encountered.
  if (MRI && !VirtRegs.empty()) {
-    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    if (!HaveSemi) {
+      OS << ";";
+      HaveSemi = true;
+    }
    for (unsigned i = 0; i != VirtRegs.size(); ++i) {
      const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
      OS << " " << TRI->getRegClassName(RC)
@ -1781,7 +1811,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,

  // Print debug location information.
  if (isDebugValue() && getOperand(e - 2).isMetadata()) {
-    if (!HaveSemi) OS << ";";
+    if (!HaveSemi)
+      OS << ";";
    auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
    OS << " line no:" <<  DV->getLine();
    if (auto *InlinedAt = debugLoc->getInlinedAt()) {
@ -1795,7 +1826,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
    if (isIndirectDebugValue())
      OS << " indirect";
  } else if (debugLoc && MF) {
-    if (!HaveSemi) OS << ";";
+    if (!HaveSemi)
+      OS << ";";
    OS << " dbg:";
    debugLoc.print(OS);
  }
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@ -315,7 +315,7 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
    if (!TRI->regsOverlap(MOReg, Reg))
      continue;

-    bool Covered = TRI->isSuperRegisterEq(MOReg, Reg);
+    bool Covered = TRI->isSuperRegisterEq(Reg, MOReg);
    if (MO.readsReg()) {
      PRI.Read = true;
      if (Covered) {
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@ -97,9 +97,8 @@ void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
    unsigned Weight = PSetI.getWeight();
    for (; PSetI.isValid(); ++PSetI) {
      CurrSetPressure[*PSetI] += Weight;
-      if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) {
-        P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI];
-      }
+      P.MaxSetPressure[*PSetI] =
+          std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
    }
  }
 }
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -6843,9 +6843,13 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
  uint64_t PtrOff = ShAmt / 8;
  unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
  SDLoc DL(LN0);
+  // The original load itself didn't wrap, so an offset within it doesn't.
+  SDNodeFlags Flags;
+  Flags.setNoUnsignedWrap(true);
  SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
                               PtrType, LN0->getBasePtr(),
-                               DAG.getConstant(PtrOff, DL, PtrType));
+                               DAG.getConstant(PtrOff, DL, PtrType),
+                               &Flags);
  AddToWorklist(NewPtr.getNode());

  SDValue Load;
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@ -2843,6 +2843,43 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
  return (AZero | BZero).isAllOnesValue();
 }

+static SDValue FoldCONCAT_VECTORS(SDLoc DL, EVT VT, ArrayRef<SDValue> Ops,
+                                  llvm::SelectionDAG &DAG) {
+  if (Ops.size() == 1)
+    return Ops[0];
+
+  // Concat of UNDEFs is UNDEF.
+  if (std::all_of(Ops.begin(), Ops.end(),
+                  [](SDValue Op) { return Op.isUndef(); }))
+    return DAG.getUNDEF(VT);
+
+  // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified
+  // to one big BUILD_VECTOR.
+  // FIXME: Add support for UNDEF and SCALAR_TO_VECTOR as well.
+  if (!std::all_of(Ops.begin(), Ops.end(), [](SDValue Op) {
+        return Op.getOpcode() == ISD::BUILD_VECTOR;
+      }))
+    return SDValue();
+
+  EVT SVT = VT.getScalarType();
+  SmallVector<SDValue, 16> Elts;
+  for (SDValue Op : Ops)
+    Elts.append(Op->op_begin(), Op->op_end());
+
+  // BUILD_VECTOR requires all inputs to be of the same type, find the
+  // maximum type and extend them all.
+  for (SDValue Op : Elts)
+    SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+
+  if (SVT.bitsGT(VT.getScalarType()))
+    for (SDValue &Op : Elts)
+      Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
+               ? DAG.getZExtOrTrunc(Op, DL, SVT)
+               : DAG.getSExtOrTrunc(Op, DL, SVT);
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
+}
+
 /// getNode - Gets or creates the specified node.
 ///
 SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
@ -3426,34 +3463,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
    if (N2.getOpcode() == ISD::EntryToken) return N1;
    if (N1 == N2) return N1;
    break;
-  case ISD::CONCAT_VECTORS:
-    // Concat of UNDEFs is UNDEF.
-    if (N1.getOpcode() == ISD::UNDEF &&
-        N2.getOpcode() == ISD::UNDEF)
-      return getUNDEF(VT);
-
-    // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
-    // one big BUILD_VECTOR.
-    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
-        N2.getOpcode() == ISD::BUILD_VECTOR) {
-      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
-                                    N1.getNode()->op_end());
-      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
-
-      // BUILD_VECTOR requires all inputs to be of the same type, find the
-      // maximum type and extend them all.
-      EVT SVT = VT.getScalarType();
-      for (SDValue Op : Elts)
-        SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
-      if (SVT.bitsGT(VT.getScalarType()))
-        for (SDValue &Op : Elts)
-          Op = TLI->isZExtFree(Op.getValueType(), SVT)
-             ? getZExtOrTrunc(Op, DL, SVT)
-             : getSExtOrTrunc(Op, DL, SVT);
-
-      return getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
-    }
+  case ISD::CONCAT_VECTORS: {
+    // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+    SDValue Ops[] = {N1, N2};
+    if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+      return V;
    break;
+  }
  case ISD::AND:
    assert(VT.isInteger() && "This operator does not apply to FP types!");
    assert(N1.getValueType() == N2.getValueType() &&
@ -3911,19 +3927,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
    }
    break;
  }
-  case ISD::CONCAT_VECTORS:
-    // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
-    // one big BUILD_VECTOR.
-    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
-        N2.getOpcode() == ISD::BUILD_VECTOR &&
-        N3.getOpcode() == ISD::BUILD_VECTOR) {
-      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
-                                    N1.getNode()->op_end());
-      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
-      Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end());
-      return getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
-    }
+  case ISD::CONCAT_VECTORS: {
+    // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+    SDValue Ops[] = {N1, N2, N3};
+    if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+      return V;
    break;
+  }
  case ISD::SETCC: {
    // Use FoldSetCC to simplify SETCC's.
    if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
@ -5462,6 +5472,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,

  switch (Opcode) {
  default: break;
+  case ISD::CONCAT_VECTORS: {
+    // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+    if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
  case ISD::SELECT_CC: {
    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -1329,12 +1329,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets);
    unsigned NumValues = ValueVTs.size();

+    // An aggregate return value cannot wrap around the address space, so
+    // offsets to its parts don't wrap either.
+    SDNodeFlags Flags;
+    Flags.setNoUnsignedWrap(true);
+
    SmallVector<SDValue, 4> Chains(NumValues);
    for (unsigned i = 0; i != NumValues; ++i) {
      SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(),
                                RetPtr.getValueType(), RetPtr,
                                DAG.getIntPtrConstant(Offsets[i],
-                                                      getCurSDLoc()));
+                                                      getCurSDLoc()),
+                                &Flags);
      Chains[i] =
        DAG.getStore(Chain, getCurSDLoc(),
                     SDValue(RetOp.getNode(), RetOp.getResNo() + i),
@ -2994,8 +3000,15 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
      if (Field) {
        // N = N + Offset
        uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
+
+        // In an inbouds GEP with an offset that is nonnegative even when
+        // interpreted as signed, assume there is no unsigned overflow.
+        SDNodeFlags Flags;
+        if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
+          Flags.setNoUnsignedWrap(true);
+
        N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
-                        DAG.getConstant(Offset, dl, N.getValueType()));
+                        DAG.getConstant(Offset, dl, N.getValueType()), &Flags);
      }

      Ty = StTy->getElementType(Field);
@ -3020,7 +3033,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
        SDValue OffsVal = VectorWidth ?
          DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) :
          DAG.getConstant(Offs, dl, PtrTy);
-        N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal);
+
+        // In an inbouds GEP with an offset that is nonnegative even when
+        // interpreted as signed, assume there is no unsigned overflow.
+        SDNodeFlags Flags;
+        if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
+          Flags.setNoUnsignedWrap(true);
+
+        N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags);
        continue;
      }

@ -3092,10 +3112,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
    Align = 0;

  // Round the size of the allocation up to the stack alignment size
-  // by add SA-1 to the size.
+  // by add SA-1 to the size. This doesn't overflow because we're computing
+  // an address inside an alloca.
+  SDNodeFlags Flags;
+  Flags.setNoUnsignedWrap(true);
  AllocSize = DAG.getNode(ISD::ADD, dl,
                          AllocSize.getValueType(), AllocSize,
-                          DAG.getIntPtrConstant(StackAlign - 1, dl));
+                          DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags);

  // Mask out the low bits for alignment purposes.
  AllocSize = DAG.getNode(ISD::AND, dl,
@ -3168,6 +3191,11 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
  if (isVolatile)
    Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);

+  // An aggregate load cannot wrap around the address space, so offsets to its
+  // parts don't wrap either.
+  SDNodeFlags Flags;
+  Flags.setNoUnsignedWrap(true);
+
  SmallVector<SDValue, 4> Values(NumValues);
  SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
  EVT PtrVT = Ptr.getValueType();
@ -3188,7 +3216,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
    }
    SDValue A = DAG.getNode(ISD::ADD, dl,
                            PtrVT, Ptr,
-                            DAG.getConstant(Offsets[i], dl, PtrVT));
+                            DAG.getConstant(Offsets[i], dl, PtrVT),
+                            &Flags);
    SDValue L = DAG.getLoad(ValueVTs[i], dl, Root,
                            A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
                            isNonTemporal, isInvariant, Alignment, AAInfo,
@ -3243,6 +3272,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
  AAMDNodes AAInfo;
  I.getAAMetadata(AAInfo);

+  // An aggregate load cannot wrap around the address space, so offsets to its
+  // parts don't wrap either.
+  SDNodeFlags Flags;
+  Flags.setNoUnsignedWrap(true);
+
  unsigned ChainI = 0;
  for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
    // See visitLoad comments.
@ -3253,7 +3287,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
      ChainI = 0;
    }
    SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
-                              DAG.getConstant(Offsets[i], dl, PtrVT));
+                              DAG.getConstant(Offsets[i], dl, PtrVT), &Flags);
    SDValue St = DAG.getStore(Root, dl,
                              SDValue(Src.getNode(), Src.getResNo() + i),
                              Add, MachinePointerInfo(PtrV, Offsets[i]),
@ -5189,7 +5223,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
    return nullptr;
  }
  case Intrinsic::experimental_gc_relocate: {
-    visitGCRelocate(I);
+    visitGCRelocate(cast<GCRelocateInst>(I));
    return nullptr;
  }
  case Intrinsic::instrprof_increment:
@ -7202,10 +7236,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
    ReturnValues.resize(NumValues);
    SmallVector<SDValue, 4> Chains(NumValues);

+    // An aggregate return value cannot wrap around the address space, so
+    // offsets to its parts don't wrap either.
+    SDNodeFlags Flags;
+    Flags.setNoUnsignedWrap(true);
+
    for (unsigned i = 0; i < NumValues; ++i) {
      SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
                                    CLI.DAG.getConstant(Offsets[i], CLI.DL,
-                                                        PtrVT));
+                                                        PtrVT), &Flags);
      SDValue L = CLI.DAG.getLoad(
          RetTys[i], CLI.DL, CLI.Chain, Add,
          MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@ -855,7 +855,7 @@ private:

  // These three are implemented in StatepointLowering.cpp
  void visitStatepoint(const CallInst &I);
-  void visitGCRelocate(const CallInst &I);
+  void visitGCRelocate(const GCRelocateInst &I);
  void visitGCResult(const CallInst &I);

  void visitUserOp1(const Instruction &I) {
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@ -633,6 +633,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
    MRI.replaceRegWith(From, To);
  }

+  if (TLI->hasCopyImplyingStackAdjustment(MF))
+    MFI->setHasOpaqueSPAdjustment(true);
+
  // Freeze the set of reserved registers now that MachineFrameInfo has been
  // set up. All the information required by getReservedRegs() should be
  // available now.
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@ -128,13 +128,11 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
    return Optional<int>();

  // Spill location is known for gc relocates
-  if (isGCRelocate(Val)) {
-    GCRelocateOperands RelocOps(cast<Instruction>(Val));
-
+  if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
    FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
-        Builder.FuncInfo.StatepointRelocatedValues[RelocOps.getStatepoint()];
+        Builder.FuncInfo.StatepointRelocatedValues[Relocate->getStatepoint()];

-    auto It = SpillMap.find(RelocOps.getDerivedPtr());
+    auto It = SpillMap.find(Relocate->getDerivedPtr());
    if (It == SpillMap.end())
      return Optional<int>();

@ -401,10 +399,10 @@ static void getIncomingStatepointGCValues(
    SmallVectorImpl<const Value *> &Bases, SmallVectorImpl<const Value *> &Ptrs,
    SmallVectorImpl<const Value *> &Relocs, ImmutableStatepoint StatepointSite,
    SelectionDAGBuilder &Builder) {
-  for (GCRelocateOperands relocateOpers : StatepointSite.getRelocates()) {
-    Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction());
-    Bases.push_back(relocateOpers.getBasePtr());
-    Ptrs.push_back(relocateOpers.getDerivedPtr());
+  for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
+    Relocs.push_back(Relocate);
+    Bases.push_back(Relocate->getBasePtr());
+    Ptrs.push_back(Relocate->getDerivedPtr());
  }

  // Remove any redundant llvm::Values which map to the same SDValue as another
@ -602,8 +600,8 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
  FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
    Builder.FuncInfo.StatepointRelocatedValues[StatepointInstr];

-  for (GCRelocateOperands RelocateOpers : StatepointSite.getRelocates()) {
-    const Value *V = RelocateOpers.getDerivedPtr();
+  for (const GCRelocateInst *Relocate : StatepointSite.getRelocates()) {
+    const Value *V = Relocate->getDerivedPtr();
    SDValue SDV = Builder.getValue(V);
    SDValue Loc = Builder.StatepointLowering.getLocation(SDV);

@ -624,8 +622,7 @@ static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
      // uses of the corresponding values so that it would automatically
      // export them. Relocates of the spilled values does not use original
      // value.
-      if (RelocateOpers.getUnderlyingCallSite().getParent() !=
-          StatepointInstr->getParent())
+      if (Relocate->getParent() != StatepointInstr->getParent())
        Builder.ExportFromCurrentBlock(V);
    }
  }
@ -656,7 +653,7 @@ void SelectionDAGBuilder::LowerStatepoint(
  // statepoint.
  for (const User *U : CS->users()) {
    const CallInst *Call = cast<CallInst>(U);
-    if (isGCRelocate(Call) && Call->getParent() == CS.getParent())
+    if (isa<GCRelocateInst>(Call) && Call->getParent() == CS.getParent())
      StatepointLowering.scheduleRelocCall(*Call);
  }
 #endif
@ -859,24 +856,22 @@ void SelectionDAGBuilder::visitGCResult(const CallInst &CI) {
  }
 }

-void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
-  GCRelocateOperands RelocateOpers(&CI);
-
+void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
 #ifndef NDEBUG
  // Consistency check
  // We skip this check for relocates not in the same basic block as thier
  // statepoint. It would be too expensive to preserve validation info through
  // different basic blocks.
-  if (RelocateOpers.getStatepoint()->getParent() == CI.getParent()) {
-    StatepointLowering.relocCallVisited(CI);
+  if (Relocate.getStatepoint()->getParent() == Relocate.getParent()) {
+    StatepointLowering.relocCallVisited(Relocate);
  }
 #endif

-  const Value *DerivedPtr = RelocateOpers.getDerivedPtr();
+  const Value *DerivedPtr = Relocate.getDerivedPtr();
  SDValue SD = getValue(DerivedPtr);

  FunctionLoweringInfo::StatepointSpilledValueMapTy &SpillMap =
-    FuncInfo.StatepointRelocatedValues[RelocateOpers.getStatepoint()];
+    FuncInfo.StatepointRelocatedValues[Relocate.getStatepoint()];

  // We should have recorded location for this pointer
  assert(SpillMap.count(DerivedPtr) && "Relocating not lowered gc value");
@ -885,7 +880,7 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
  // We didn't need to spill these special cases (constants and allocas).
  // See the handling in spillIncomingValueForStatepoint for detail.
  if (!DerivedPtrLocation) {
-    setValue(&CI, SD);
+    setValue(&Relocate, SD);
    return;
  }

@ -907,5 +902,5 @@ void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
  DAG.setRoot(SpillLoad.getValue(1));

  assert(SpillLoad.getNode());
-  setValue(&CI, SpillLoad);
+  setValue(&Relocate, SpillLoad);
 }
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@ -212,7 +212,7 @@ unsigned TargetSchedModel::computeOperandLatency(
      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
      && SchedModel.isComplete()) {
    errs() << "DefIdx " << DefIdx << " exceeds machine model writes for "
-           << *DefMI;
+           << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)";
    llvm_unreachable("incomplete machine model");
  }
 #endif
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@ -17,11 +17,14 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@ -435,11 +438,12 @@ void llvm::calculateWinCXXEHStateNumbers(const Function *Fn,
  calculateStateNumbersForInvokes(Fn, FuncInfo);
 }

-static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int ParentState,
-                           ClrHandlerType HandlerType, uint32_t TypeToken,
-                           const BasicBlock *Handler) {
+static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState,
+                           int TryParentState, ClrHandlerType HandlerType,
+                           uint32_t TypeToken, const BasicBlock *Handler) {
  ClrEHUnwindMapEntry Entry;
-  Entry.Parent = ParentState;
+  Entry.HandlerParentState = HandlerParentState;
+  Entry.TryParentState = TryParentState;
  Entry.Handler = Handler;
  Entry.HandlerType = HandlerType;
  Entry.TypeToken = TypeToken;
@ -453,82 +457,199 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn,
  if (!FuncInfo.EHPadStateMap.empty())
    return;

-  SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
+  // This numbering assigns one state number to each catchpad and cleanuppad.
+  // It also computes two tree-like relations over states:
+  // 1) Each state has a "HandlerParentState", which is the state of the next
+  //    outer handler enclosing this state's handler (same as nearest ancestor
+  //    per the ParentPad linkage on EH pads, but skipping over catchswitches).
+  // 2) Each state has a "TryParentState", which:
+  //    a) for a catchpad that's not the last handler on its catchswitch, is
+  //       the state of the next catchpad on that catchswitch
+  //    b) for all other pads, is the state of the pad whose try region is the
+  //       next outer try region enclosing this state's try region.  The "try
+  //       regions are not present as such in the IR, but will be inferred
+  //       based on the placement of invokes and pads which reach each other
+  //       by exceptional exits
+  // Catchswitches do not get their own states, but each gets mapped to the
+  // state of its first catchpad.

-  // Each pad needs to be able to refer to its parent, so scan the function
-  // looking for top-level handlers and seed the worklist with them.
+  // Step one: walk down from outermost to innermost funclets, assigning each
+  // catchpad and cleanuppad a state number.  Add an entry to the
+  // ClrEHUnwindMap for each state, recording its HandlerParentState and
+  // handler attributes.  Record the TryParentState as well for each catchpad
+  // that's not the last on its catchswitch, but initialize all other entries'
+  // TryParentStates to a sentinel -1 value that the next pass will update.
+
+  // Seed a worklist with pads that have no parent.
+  SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
  for (const BasicBlock &BB : *Fn) {
-    if (!BB.isEHPad())
-      continue;
-    if (BB.isLandingPad())
-      report_fatal_error("CoreCLR EH cannot use landingpads");
    const Instruction *FirstNonPHI = BB.getFirstNonPHI();
-    if (!isTopLevelPadForMSVC(FirstNonPHI))
+    const Value *ParentPad;
+    if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+      ParentPad = CPI->getParentPad();
+    else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+      ParentPad = CSI->getParentPad();
+    else
      continue;
-    // queue this with sentinel parent state -1 to mean unwind to caller.
-    Worklist.emplace_back(FirstNonPHI, -1);
+    if (isa<ConstantTokenNone>(ParentPad))
+      Worklist.emplace_back(FirstNonPHI, -1);
  }

+  // Use the worklist to visit all pads, from outer to inner.  Record
+  // HandlerParentState for all pads.  Record TryParentState only for catchpads
+  // that aren't the last on their catchswitch (setting all other entries'
+  // TryParentStates to an initial value of -1).  This loop is also responsible
+  // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+  // catchswitches.
  while (!Worklist.empty()) {
    const Instruction *Pad;
-    int ParentState;
-    std::tie(Pad, ParentState) = Worklist.pop_back_val();
+    int HandlerParentState;
+    std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();

-    Value *ParentPad;
-    int PredState;
-    if (const CleanupPadInst *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
-      // A cleanup can have multiple exits; don't re-process after the first.
-      if (FuncInfo.EHPadStateMap.count(Cleanup))
-        continue;
-      // CoreCLR personality uses arity to distinguish faults from finallies.
-      const BasicBlock *PadBlock = Cleanup->getParent();
+    if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+      // Create the entry for this cleanup with the appropriate handler
+      // properties.  Finaly and fault handlers are distinguished by arity.
      ClrHandlerType HandlerType =
-          (Cleanup->getNumOperands() ? ClrHandlerType::Fault
-                                     : ClrHandlerType::Finally);
-      int NewState =
-          addClrEHHandler(FuncInfo, ParentState, HandlerType, 0, PadBlock);
-      FuncInfo.EHPadStateMap[Cleanup] = NewState;
-      // Propagate the new state to all preds of the cleanup
-      ParentPad = Cleanup->getParentPad();
-      PredState = NewState;
-    } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
-      SmallVector<const CatchPadInst *, 1> Handlers;
-      for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
-        const auto *Catch = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI());
-        Handlers.push_back(Catch);
-      }
-      FuncInfo.EHPadStateMap[CatchSwitch] = ParentState;
-      int NewState = ParentState;
-      for (auto HandlerI = Handlers.rbegin(), HandlerE = Handlers.rend();
-           HandlerI != HandlerE; ++HandlerI) {
-        const CatchPadInst *Catch = *HandlerI;
-        const BasicBlock *PadBlock = Catch->getParent();
+          (Cleanup->getNumArgOperands() ? ClrHandlerType::Fault
+                                        : ClrHandlerType::Finally);
+      int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+                                         HandlerType, 0, Pad->getParent());
+      // Queue any child EH pads on the worklist.
+      for (const User *U : Cleanup->users())
+        if (const auto *I = dyn_cast<Instruction>(U))
+          if (I->isEHPad())
+            Worklist.emplace_back(I, CleanupState);
+      // Remember this pad's state.
+      FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+    } else {
+      // Walk the handlers of this catchswitch in reverse order since all but
+      // the last need to set the following one as its TryParentState.
+      const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+      int CatchState = -1, FollowerState = -1;
+      SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+      for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend();
+           CBI != CBE; ++CBI, FollowerState = CatchState) {
+        const BasicBlock *CatchBlock = *CBI;
+        // Create the entry for this catch with the appropriate handler
+        // properties.
+        const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
        uint32_t TypeToken = static_cast<uint32_t>(
            cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
-        NewState = addClrEHHandler(FuncInfo, NewState, ClrHandlerType::Catch,
-                                   TypeToken, PadBlock);
-        FuncInfo.EHPadStateMap[Catch] = NewState;
+        CatchState =
+            addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+                            ClrHandlerType::Catch, TypeToken, CatchBlock);
+        // Queue any child EH pads on the worklist.
+        for (const User *U : Catch->users())
+          if (const auto *I = dyn_cast<Instruction>(U))
+            if (I->isEHPad())
+              Worklist.emplace_back(I, CatchState);
+        // Remember this catch's state.
+        FuncInfo.EHPadStateMap[Catch] = CatchState;
      }
-      for (const auto *CatchPad : Handlers) {
-        for (const User *U : CatchPad->users()) {
-          const auto *UserI = cast<Instruction>(U);
-          if (UserI->isEHPad())
-            Worklist.emplace_back(UserI, ParentState);
-        }
-      }
-      PredState = NewState;
-      ParentPad = CatchSwitch->getParentPad();
-    } else {
-      llvm_unreachable("Unexpected EH pad");
-    }
-
-    // Queue all predecessors with the given state
-    for (const BasicBlock *Pred : predecessors(Pad->getParent())) {
-      if ((Pred = getEHPadFromPredecessor(Pred, ParentPad)))
-        Worklist.emplace_back(Pred->getFirstNonPHI(), PredState);
+      // Associate the catchswitch with the state of its first catch.
+      assert(CatchSwitch->getNumHandlers());
+      FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
    }
  }

+  // Step two: record the TryParentState of each state.  For cleanuppads that
+  // don't have cleanuprets, we may need to infer this from their child pads,
+  // so visit pads in descendant-most to ancestor-most order.
+  for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(),
+            End = FuncInfo.ClrEHUnwindMap.rend();
+       Entry != End; ++Entry) {
+    const Instruction *Pad =
+        Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI();
+    // For most pads, the TryParentState is the state associated with the
+    // unwind dest of exceptional exits from it.
+    const BasicBlock *UnwindDest;
+    if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) {
+      // If a catch is not the last in its catchswitch, its TryParentState is
+      // the state associated with the next catch in the switch, even though
+      // that's not the unwind dest of exceptions escaping the catch.  Those
+      // cases were already assigned a TryParentState in the first pass, so
+      // skip them.
+      if (Entry->TryParentState != -1)
+        continue;
+      // Otherwise, get the unwind dest from the catchswitch.
+      UnwindDest = Catch->getCatchSwitch()->getUnwindDest();
+    } else {
+      const auto *Cleanup = cast<CleanupPadInst>(Pad);
+      UnwindDest = nullptr;
+      for (const User *U : Cleanup->users()) {
+        if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+          // Common and unambiguous case -- cleanupret indicates cleanup's
+          // unwind dest.
+          UnwindDest = CleanupRet->getUnwindDest();
+          break;
+        }
+
+        // Get an unwind dest for the user
+        const BasicBlock *UserUnwindDest = nullptr;
+        if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+          UserUnwindDest = Invoke->getUnwindDest();
+        } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) {
+          UserUnwindDest = CatchSwitch->getUnwindDest();
+        } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) {
+          int UserState = FuncInfo.EHPadStateMap[ChildCleanup];
+          int UserUnwindState =
+              FuncInfo.ClrEHUnwindMap[UserState].TryParentState;
+          if (UserUnwindState != -1)
+            UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState]
+                                 .Handler.get<const BasicBlock *>();
+        }
+
+        // Not having an unwind dest for this user might indicate that it
+        // doesn't unwind, so can't be taken as proof that the cleanup itself
+        // may unwind to caller (see e.g. SimplifyUnreachable and
+        // RemoveUnwindEdge).
+        if (!UserUnwindDest)
+          continue;
+
+        // Now we have an unwind dest for the user, but we need to see if it
+        // unwinds all the way out of the cleanup or if it stays within it.
+        const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI();
+        const Value *UserUnwindParent;
+        if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad))
+          UserUnwindParent = CSI->getParentPad();
+        else
+          UserUnwindParent =
+              cast<CleanupPadInst>(UserUnwindPad)->getParentPad();
+
+        // The unwind stays within the cleanup iff it targets a child of the
+        // cleanup.
+        if (UserUnwindParent == Cleanup)
+          continue;
+
+        // This unwind exits the cleanup, so its dest is the cleanup's dest.
+        UnwindDest = UserUnwindDest;
+        break;
+      }
+    }
+
+    // Record the state of the unwind dest as the TryParentState.
+    int UnwindDestState;
+
+    // If UnwindDest is null at this point, either the pad in question can
+    // be exited by unwind to caller, or it cannot be exited by unwind.  In
+    // either case, reporting such cases as unwinding to caller is correct.
+    // This can lead to EH tables that "look strange" -- if this pad's is in
+    // a parent funclet which has other children that do unwind to an enclosing
+    // pad, the try region for this pad will be missing the "duplicate" EH
+    // clause entries that you'd expect to see covering the whole parent.  That
+    // should be benign, since the unwind never actually happens.  If it were
+    // an issue, we could add a subsequent pass that pushes unwind dests down
+    // from parents that have them to children that appear to unwind to caller.
+    if (!UnwindDest) {
+      UnwindDestState = -1;
+    } else {
+      UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
+    }
+
+    Entry->TryParentState = UnwindDestState;
+  }
+
+  // Step three: transfer information from pads to invokes.
  calculateStateNumbersForInvokes(Fn, FuncInfo);
 }

@ -597,6 +718,11 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
  for (auto &Funclets : FuncletBlocks) {
    BasicBlock *FuncletPadBB = Funclets.first;
    std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+    Value *FuncletToken;
+    if (FuncletPadBB == &F.getEntryBlock())
+      FuncletToken = ConstantTokenNone::get(F.getContext());
+    else
+      FuncletToken = FuncletPadBB->getFirstNonPHI();

    std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
    ValueToValueMapTy VMap;
@ -668,15 +794,44 @@ void WinEHPrepare::cloneCommonBlocks(Function &F) {
        RemapInstruction(&I, VMap,
                         RF_IgnoreMissingEntries | RF_NoModuleLevelChanges);

+    // Catchrets targeting cloned blocks need to be updated separately from
+    // the loop above because they are not in the current funclet.
+    SmallVector<CatchReturnInst *, 2> FixupCatchrets;
+    for (auto &BBMapping : Orig2Clone) {
+      BasicBlock *OldBlock = BBMapping.first;
+      BasicBlock *NewBlock = BBMapping.second;
+
+      FixupCatchrets.clear();
+      for (BasicBlock *Pred : predecessors(OldBlock))
+        if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator()))
+          if (CatchRet->getParentPad() == FuncletToken)
+            FixupCatchrets.push_back(CatchRet);
+
+      for (CatchReturnInst *CatchRet : FixupCatchrets)
+        CatchRet->setSuccessor(NewBlock);
+    }
+
    auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) {
      unsigned NumPreds = PN->getNumIncomingValues();
      for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd;
           ++PredIdx) {
        BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx);
-        ColorVector &IncomingColors = BlockColors[IncomingBlock];
-        bool BlockInFunclet = IncomingColors.size() == 1 &&
-                              IncomingColors.front() == FuncletPadBB;
-        if (IsForOldBlock != BlockInFunclet)
+        bool EdgeTargetsFunclet;
+        if (auto *CRI =
+                dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+          EdgeTargetsFunclet = (CRI->getParentPad() == FuncletToken);
+        } else {
+          ColorVector &IncomingColors = BlockColors[IncomingBlock];
+          assert(!IncomingColors.empty() && "Block not colored!");
+          assert((IncomingColors.size() == 1 ||
+                  llvm::all_of(IncomingColors,
+                               [&](BasicBlock *Color) {
+                                 return Color != FuncletPadBB;
+                               })) &&
+                 "Cloning should leave this funclet's blocks monochromatic");
+          EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB);
+        }
+        if (IsForOldBlock != EdgeTargetsFunclet)
          continue;
        PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false);
        // Revisit the next entry.
@ -864,7 +1019,6 @@ void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
 }

 void WinEHPrepare::verifyPreparedFunclets(Function &F) {
-  // Recolor the CFG to verify that all is well.
  for (BasicBlock &BB : F) {
    size_t NumColors = BlockColors[&BB].size();
    assert(NumColors == 1 && "Expected monochromatic BB!");
@ -872,12 +1026,8 @@ void WinEHPrepare::verifyPreparedFunclets(Function &F) {
      report_fatal_error("Uncolored BB!");
    if (NumColors > 1)
      report_fatal_error("Multicolor BB!");
-    if (!DisableDemotion) {
-      bool EHPadHasPHI = BB.isEHPad() && isa<PHINode>(BB.begin());
-      assert(!EHPadHasPHI && "EH Pad still has a PHI!");
-      if (EHPadHasPHI)
-        report_fatal_error("EH Pad still has a PHI!");
-    }
+    assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+           "EH Pad still has a PHI!");
  }
 }

@ -896,12 +1046,17 @@ bool WinEHPrepare::prepareExplicitEH(Function &F) {
    demotePHIsOnFunclets(F);

  if (!DisableCleanups) {
+    DEBUG(verifyFunction(F));
    removeImplausibleInstructions(F);

+    DEBUG(verifyFunction(F));
    cleanupPreparedFunclets(F);
  }

-  verifyPreparedFunclets(F);
+  DEBUG(verifyPreparedFunclets(F));
+  // Recolor the CFG to verify that all is well.
+  DEBUG(colorFunclets(F));
+  DEBUG(verifyPreparedFunclets(F));

  BlockColors.clear();
  FuncletBlocks.clear();
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@ -281,6 +281,7 @@ int FuzzerDriver(const std::vector<std::string> &Args,
  if (Flags.verbosity > 0 && !Dictionary.empty())
    Printf("Dictionary: %zd entries\n", Dictionary.size());
  Options.SaveArtifacts = !Flags.test_single_input;
+  Options.PrintNewCovPcs = Flags.print_new_cov_pcs;

  Fuzzer F(USF, Options);

--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@ -72,3 +72,5 @@ FUZZER_FLAG_STRING(exact_artifact_path,
 FUZZER_FLAG_INT(drill, 0, "Experimental: fuzz using a single unit as the seed "
                          "corpus, then merge with the initial corpus")
 FUZZER_FLAG_INT(output_csv, 0, "Enable pulse output in CSV format.")
+FUZZER_FLAG_INT(print_new_cov_pcs, 0, "If 1, print out new covered pcs.")
+
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@ -97,6 +97,7 @@ class Fuzzer {
    bool SaveArtifacts = true;
    bool PrintNEW = true;  // Print a status line when new units are found;
    bool OutputCSV = false;
+    bool PrintNewCovPcs = false;
  };
  Fuzzer(UserSuppliedFuzzer &USF, FuzzingOptions Options);
  void AddToCorpus(const Unit &U) { Corpus.push_back(U); }
@ -188,6 +189,7 @@ class Fuzzer {
  long EpochOfLastReadOfOutputCorpus = 0;
  size_t LastRecordedBlockCoverage = 0;
  size_t LastRecordedCallerCalleeCoverage = 0;
+  size_t LastCoveragePcBufferLen = 0;
 };

 class SimpleUserSuppliedFuzzer: public UserSuppliedFuzzer {
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@ -31,6 +31,8 @@ void __sanitizer_set_death_callback(void (*callback)(void));
 __attribute__((weak)) size_t __sanitizer_get_number_of_counters();
 __attribute__((weak))
 uintptr_t __sanitizer_update_counter_bitset_and_clear_counters(uint8_t *bitset);
+__attribute__((weak)) uintptr_t
+__sanitizer_get_coverage_pc_buffer(uintptr_t **data);
 }

 namespace fuzzer {
@ -249,7 +251,21 @@ void Fuzzer::ExecuteCallback(const Unit &U) {

 size_t Fuzzer::RecordBlockCoverage() {
  CHECK_WEAK_API_FUNCTION(__sanitizer_get_total_unique_coverage);
-  return LastRecordedBlockCoverage = __sanitizer_get_total_unique_coverage();
+  uintptr_t PrevCoverage = LastRecordedBlockCoverage;
+  LastRecordedBlockCoverage = __sanitizer_get_total_unique_coverage();
+
+  if (PrevCoverage == LastRecordedBlockCoverage || !Options.PrintNewCovPcs)
+    return LastRecordedBlockCoverage;
+
+  uintptr_t PrevBufferLen = LastCoveragePcBufferLen;
+  uintptr_t *CoverageBuf;
+  LastCoveragePcBufferLen = __sanitizer_get_coverage_pc_buffer(&CoverageBuf);
+  assert(CoverageBuf);
+  for (size_t i = PrevBufferLen; i < LastCoveragePcBufferLen; ++i) {
+    Printf("0x%x\n", CoverageBuf[i]);
+  }
+
+  return LastRecordedBlockCoverage;
 }

 size_t Fuzzer::RecordCallerCalleeCoverage() {
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@ -117,11 +117,18 @@ size_t MutationDispatcher::Mutate_AddWordFromDictionary(uint8_t *Data,
  assert(!D.empty());
  if (D.empty()) return 0;
  const Unit &Word = D[Rand(D.size())];
-  if (Size + Word.size() > MaxSize) return 0;
-  size_t Idx = Rand(Size + 1);
-  memmove(Data + Idx + Word.size(), Data + Idx, Size - Idx);
-  memcpy(Data + Idx, Word.data(), Word.size());
-  return Size + Word.size();
+  if (Rand.RandBool()) {  // Insert Word.
+    if (Size + Word.size() > MaxSize) return 0;
+    size_t Idx = Rand(Size + 1);
+    memmove(Data + Idx + Word.size(), Data + Idx, Size - Idx);
+    memcpy(Data + Idx, Word.data(), Word.size());
+    return Size + Word.size();
+  } else {  // Overwrite some bytes with Word.
+    if (Word.size() > Size) return 0;
+    size_t Idx = Rand(Size - Word.size());
+    memcpy(Data + Idx, Word.data(), Word.size());
+    return Size;
+  }
 }

 size_t MutationDispatcher::Mutate_ChangeASCIIInteger(uint8_t *Data, size_t Size,
--- a/lib/Fuzzer/FuzzerTraceState.cpp
+++ b/lib/Fuzzer/FuzzerTraceState.cpp
@ -77,6 +77,7 @@

 #include <algorithm>
 #include <cstring>
+#include <thread>
 #include <unordered_map>

 #if !LLVM_FUZZER_SUPPORTS_DFSAN
@ -172,8 +173,13 @@ struct TraceBasedMutation {

 class TraceState {
 public:
-   TraceState(const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
-       : Options(Options), CurrentUnit(CurrentUnit) {}
+  TraceState(const Fuzzer::FuzzingOptions &Options, const Unit &CurrentUnit)
+       : Options(Options), CurrentUnit(CurrentUnit) {
+    // Current trace collection is not thread-friendly and it probably
+    // does not have to be such, but at least we should not crash in presence
+    // of threads. So, just ignore all traces coming from all threads but one.
+    IsMyThread = true;
+  }

  LabelRange GetLabelRange(dfsan_label L);
  void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
@ -213,8 +219,11 @@ class TraceState {
  LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)];
  const Fuzzer::FuzzingOptions &Options;
  const Unit &CurrentUnit;
+  static thread_local bool IsMyThread;
 };

+thread_local bool TraceState::IsMyThread;
+
 LabelRange TraceState::GetLabelRange(dfsan_label L) {
  LabelRange &LR = LabelRanges[L];
  if (LR.Beg < LR.End || L == 0)
@ -238,7 +247,7 @@ void TraceState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
                                  uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
                                  dfsan_label L2) {
  assert(ReallyHaveDFSan());
-  if (!RecordingTraces) return;
+  if (!RecordingTraces || !IsMyThread) return;
  if (L1 == 0 && L2 == 0)
    return;  // Not actionable.
  if (L1 != 0 && L2 != 0)
@ -267,7 +276,7 @@ void TraceState::DFSanSwitchCallback(uint64_t PC, size_t ValSizeInBits,
                                     uint64_t Val, size_t NumCases,
                                     uint64_t *Cases, dfsan_label L) {
  assert(ReallyHaveDFSan());
-  if (!RecordingTraces) return;
+  if (!RecordingTraces || !IsMyThread) return;
  if (!L) return;  // Not actionable.
  LabelRange LR = GetLabelRange(L);
  size_t ValSize = ValSizeInBits / 8;
@ -312,7 +321,7 @@ int TraceState::TryToAddDesiredData(uint64_t PresentData, uint64_t DesiredData,

 void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
                                  uint64_t Arg1, uint64_t Arg2) {
-  if (!RecordingTraces) return;
+  if (!RecordingTraces || !IsMyThread) return;
  int Added = 0;
  if (Options.Verbosity >= 3)
    Printf("TraceCmp %zd/%zd: %p %zd %zd\n", CmpSize, CmpType, PC, Arg1, Arg2);
@ -327,7 +336,7 @@ void TraceState::TraceCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
 void TraceState::TraceSwitchCallback(uintptr_t PC, size_t ValSizeInBits,
                                     uint64_t Val, size_t NumCases,
                                     uint64_t *Cases) {
-  if (!RecordingTraces) return;
+  if (!RecordingTraces || !IsMyThread) return;
  size_t ValSize = ValSizeInBits / 8;
  bool TryShort = IsTwoByteData(Val);
  for (size_t i = 0; i < NumCases; i++)
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@ -26,6 +26,7 @@ set(Tests
  StrcmpTest
  StrncmpTest
  SwitchTest
+  ThreadedTest
  TimeoutTest
  )

--- a/lib/Fuzzer/test/ThreadedTest.cpp
+++ b/lib/Fuzzer/test/ThreadedTest.cpp
@ -0,0 +1,23 @@
+// Threaded test for a fuzzer. The fuzzer should not crash.
+#include <assert.h>
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <thread>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size < 8) return 0;
+  assert(Data);
+  auto C = [&] {
+    size_t Res = 0;
+    for (size_t i = 0; i < Size / 2; i++)
+      Res += memcmp(Data, Data + Size / 2, 4);
+    return Res;
+  };
+  std::thread T[] = {std::thread(C), std::thread(C), std::thread(C),
+                     std::thread(C), std::thread(C), std::thread(C)};
+  for (auto &X : T)
+    X.join();
+  return 0;
+}
+
--- a/lib/Fuzzer/test/fuzzer-threaded.test
+++ b/lib/Fuzzer/test/fuzzer-threaded.test
@ -0,0 +1,7 @@
+CHECK: Done 1000 runs in
+
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000  2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000  2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000  2>&1 | FileCheck %s
+RUN: LLVMFuzzer-ThreadedTest -use_traces=1 -runs=1000  2>&1 | FileCheck %s
+
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@ -30,3 +30,9 @@ RUN:     LLVMFuzzer-SimpleDictionaryTest                    -seed=1 -runs=100000

 RUN: not LLVMFuzzer-UninstrumentedTest-Uninstrumented 2>&1 | FileCheck %s --check-prefix=UNINSTRUMENTED
 UNINSTRUMENTED: ERROR: __sanitizer_set_death_callback is not defined. Exiting.
+
+RUN: LLVMFuzzer-SimpleTest -print_new_cov_pcs=1 2>&1 | FileCheck %s --check-prefix=PCS
+PCS:{{^0x[a-f0-9]+}}
+PCS:NEW
+PCS:BINGO
+
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@ -2060,7 +2060,7 @@ private:

  // printGCRelocateComment - print comment after call to the gc.relocate
  // intrinsic indicating base and derived pointer names.
-  void printGCRelocateComment(const Value &V);
+  void printGCRelocateComment(const GCRelocateInst &Relocate);
 };
 } // namespace

@ -2722,14 +2722,11 @@ void AssemblyWriter::printInstructionLine(const Instruction &I) {

 /// printGCRelocateComment - print comment after call to the gc.relocate
 /// intrinsic indicating base and derived pointer names.
-void AssemblyWriter::printGCRelocateComment(const Value &V) {
-  assert(isGCRelocate(&V));
-  GCRelocateOperands GCOps(cast<Instruction>(&V));
-
+void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
  Out << " ; (";
-  writeOperand(GCOps.getBasePtr(), false);
+  writeOperand(Relocate.getBasePtr(), false);
  Out << ", ";
-  writeOperand(GCOps.getDerivedPtr(), false);
+  writeOperand(Relocate.getDerivedPtr(), false);
  Out << ")";
 }

@ -2737,8 +2734,8 @@ void AssemblyWriter::printGCRelocateComment(const Value &V) {
 /// which slot it occupies.
 ///
 void AssemblyWriter::printInfoComment(const Value &V) {
-  if (isGCRelocate(&V))
-    printGCRelocateComment(V);
+  if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
+    printGCRelocateComment(*Relocate);

  if (AnnotationWriter)
    AnnotationWriter->printInfoComment(V, Out);
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@ -641,14 +641,15 @@ AttributeSet AttributeSet::get(LLVMContext &C,
  if (Attrs.empty())
    return AttributeSet();

-#ifndef NDEBUG
-  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
-    assert((!i || Attrs[i-1].first <= Attrs[i].first) &&
-           "Misordered Attributes list!");
-    assert(!Attrs[i].second.hasAttribute(Attribute::None) &&
-           "Pointless attribute!");
-  }
-#endif
+  assert(std::is_sorted(Attrs.begin(), Attrs.end(),
+                        [](const std::pair<unsigned, Attribute> &LHS,
+                           const std::pair<unsigned, Attribute> &RHS) {
+                          return LHS.first < RHS.first;
+                        }) && "Misordered Attributes list!");
+  assert(std::none_of(Attrs.begin(), Attrs.end(),
+                      [](const std::pair<unsigned, Attribute> &Pair) {
+                        return Pair.second.hasAttribute(Attribute::None);
+                      }) && "Pointless attribute!");

  // Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
  // list.
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@ -76,22 +76,21 @@ iplist<Instruction>::iterator Instruction::eraseFromParent() {
  return getParent()->getInstList().erase(getIterator());
 }

-/// insertBefore - Insert an unlinked instructions into a basic block
-/// immediately before the specified instruction.
+/// Insert an unlinked instruction into a basic block immediately before the
+/// specified instruction.
 void Instruction::insertBefore(Instruction *InsertPos) {
  InsertPos->getParent()->getInstList().insert(InsertPos->getIterator(), this);
 }

-/// insertAfter - Insert an unlinked instructions into a basic block
-/// immediately after the specified instruction.
+/// Insert an unlinked instruction into a basic block immediately after the
+/// specified instruction.
 void Instruction::insertAfter(Instruction *InsertPos) {
  InsertPos->getParent()->getInstList().insertAfter(InsertPos->getIterator(),
                                                    this);
 }

-/// moveBefore - Unlink this instruction from its current basic block and
-/// insert it into the basic block that MovePos lives in, right before
-/// MovePos.
+/// Unlink this instruction from its current basic block and insert it into the
+/// basic block that MovePos lives in, right before MovePos.
 void Instruction::moveBefore(Instruction *MovePos) {
  MovePos->getParent()->getInstList().splice(
      MovePos->getIterator(), getParent()->getInstList(), getIterator());
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@ -609,20 +609,6 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
  return setSuccessor(idx, B);
 }

-bool InvokeInst::hasFnAttrImpl(Attribute::AttrKind A) const {
-  if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
-    return true;
-
-  // Operand bundles override attributes on the called function, but don't
-  // override attributes directly present on the invoke instruction.
-  if (isFnAttrDisallowedByOpBundle(A))
-    return false;
-
-  if (const Function *F = getCalledFunction())
-    return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
-  return false;
-}
-
 bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
  assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");

@ -934,6 +920,17 @@ void CatchSwitchInst::addHandler(BasicBlock *Handler) {
  getOperandList()[OpNo] = Handler;
 }

+void CatchSwitchInst::removeHandler(handler_iterator HI) {
+  // Move all subsequent handlers up one.
+  Use *EndDst = op_end() - 1;
+  for (Use *CurDst = HI.getCurrent(); CurDst != EndDst; ++CurDst)
+    *CurDst = *(CurDst + 1);
+  // Null out the last handler use.
+  *EndDst = nullptr;
+
+  setNumHungOffUseOperands(getNumOperands() - 1);
+}
+
 BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
  return getSuccessor(idx);
 }
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@ -190,6 +190,8 @@ void ReplaceableMetadataImpl::moveRef(void *Ref, void *New,
 void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
  assert(!(MD && isa<MDNode>(MD) && cast<MDNode>(MD)->isTemporary()) &&
         "Expected non-temp node");
+  assert(CanReplace &&
+         "Attempted to replace Metadata marked for no replacement");

  if (UseMap.empty())
    return;
@ -555,7 +557,7 @@ void MDNode::decrementUnresolvedOperandCount() {
    resolve();
 }

-void MDNode::resolveCycles(bool MDMaterialized) {
+void MDNode::resolveCycles(bool AllowTemps) {
  if (isResolved())
    return;

@ -568,7 +570,7 @@ void MDNode::resolveCycles(bool MDMaterialized) {
    if (!N)
      continue;

-    if (N->isTemporary() && !MDMaterialized)
+    if (N->isTemporary() && AllowTemps)
      continue;
    assert(!N->isTemporary() &&
           "Expected all forward declarations to be resolved");
--- a/lib/IR/Statepoint.cpp
+++ b/lib/IR/Statepoint.cpp
@ -40,20 +40,7 @@ bool llvm::isStatepoint(const Value &inst) {
 }

 bool llvm::isGCRelocate(const ImmutableCallSite &CS) {
-  if (!CS.getInstruction()) {
-    // This is not a call site
-    return false;
-  }
-
-  return isGCRelocate(CS.getInstruction());
-}
-bool llvm::isGCRelocate(const Value *inst) {
-  if (const CallInst *call = dyn_cast<CallInst>(inst)) {
-    if (const Function *F = call->getCalledFunction()) {
-      return F->getIntrinsicID() == Intrinsic::experimental_gc_relocate;
-    }
-  }
-  return false;
+  return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction());
 }

 bool llvm::isGCResult(const ImmutableCallSite &CS) {
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@ -1657,14 +1657,14 @@ void Verifier::VerifyStatepoint(ImmutableCallSite CS) {
    const CallInst *Call = dyn_cast<const CallInst>(U);
    Assert(Call, "illegal use of statepoint token", &CI, U);
    if (!Call) continue;
-    Assert(isGCRelocate(Call) || isGCResult(Call),
+    Assert(isa<GCRelocateInst>(Call) || isGCResult(Call),
           "gc.result or gc.relocate are the only value uses"
           "of a gc.statepoint",
           &CI, U);
    if (isGCResult(Call)) {
      Assert(Call->getArgOperand(0) == &CI,
             "gc.result connected to wrong gc.statepoint", &CI, Call);
-    } else if (isGCRelocate(Call)) {
+    } else if (isa<GCRelocateInst>(Call)) {
      Assert(Call->getArgOperand(0) == &CI,
             "gc.relocate connected to wrong gc.statepoint", &CI, Call);
    }
@ -3019,8 +3019,7 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
         &CPI);

  auto *ParentPad = CPI.getParentPad();
-  Assert(isa<CatchSwitchInst>(ParentPad) || isa<ConstantTokenNone>(ParentPad) ||
-             isa<CleanupPadInst>(ParentPad) || isa<CatchPadInst>(ParentPad),
+  Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad),
         "CleanupPadInst has an invalid parent.", &CPI);

  User *FirstUser = nullptr;
@ -3077,10 +3076,17 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
  }

  auto *ParentPad = CatchSwitch.getParentPad();
-  Assert(isa<CatchSwitchInst>(ParentPad) || isa<ConstantTokenNone>(ParentPad) ||
-             isa<CleanupPadInst>(ParentPad) || isa<CatchPadInst>(ParentPad),
+  Assert(isa<ConstantTokenNone>(ParentPad) || isa<FuncletPadInst>(ParentPad),
         "CatchSwitchInst has an invalid parent.", ParentPad);

+  Assert(CatchSwitch.getNumHandlers() != 0,
+         "CatchSwitchInst cannot have empty handler list", &CatchSwitch);
+
+  for (BasicBlock *Handler : CatchSwitch.handlers()) {
+    Assert(isa<CatchPadInst>(Handler->getFirstNonPHI()),
+           "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler);
+  }
+
  visitTerminatorInst(CatchSwitch);
 }

@ -3675,8 +3681,8 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {

    // Verify rest of the relocate arguments

-    GCRelocateOperands Ops(CS);
-    ImmutableCallSite StatepointCS(Ops.getStatepoint());
+    ImmutableCallSite StatepointCS(
+        cast<GCRelocateInst>(*CS.getInstruction()).getStatepoint());

    // Both the base and derived must be piped through the safepoint
    Value* Base = CS.getArgOperand(1);
@ -3731,14 +3737,14 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
    // Relocated value must be a pointer type, but gc_relocate does not need to return the
    // same pointer type as the relocated pointer. It can be casted to the correct type later
    // if it's desired. However, they must have the same address space.
-    GCRelocateOperands Operands(CS);
-    Assert(Operands.getDerivedPtr()->getType()->isPointerTy(),
+    GCRelocateInst &Relocate = cast<GCRelocateInst>(*CS.getInstruction());
+    Assert(Relocate.getDerivedPtr()->getType()->isPointerTy(),
           "gc.relocate: relocated value must be a gc pointer", CS);

    // gc_relocate return type must be a pointer type, and is verified earlier in
    // VerifyIntrinsicType().
    Assert(cast<PointerType>(CS.getType())->getAddressSpace() ==
-           cast<PointerType>(Operands.getDerivedPtr()->getType())->getAddressSpace(),
+           cast<PointerType>(Relocate.getDerivedPtr()->getType())->getAddressSpace(),
           "gc.relocate: relocating a pointer shouldn't change its address space", CS);
    break;
  }
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@ -524,6 +524,23 @@ public:
      ValueMapperFlags = ValueMapperFlags | RF_HaveUnmaterializedMetadata;
  }

+  ~IRLinker() {
+    // In the case where we are not linking metadata, we unset the CanReplace
+    // flag on all temporary metadata in the MetadataToIDs map to ensure
+    // none was replaced while being a map key. Now that we are destructing
+    // the map, set the flag back to true, so that it is replaceable during
+    // metadata linking.
+    if (!shouldLinkMetadata()) {
+      for (auto MDI : MetadataToIDs) {
+        Metadata *MD = const_cast<Metadata *>(MDI.first);
+        MDNode *Node = dyn_cast<MDNode>(MD);
+        assert((Node && Node->isTemporary()) &&
+               "Found non-temp metadata in map when not linking metadata");
+        Node->setCanReplace(true);
+      }
+    }
+  }
+
  bool run();
  Value *materializeDeclFor(Value *V, bool ForAlias);
  void materializeInitFor(GlobalValue *New, GlobalValue *Old, bool ForAlias);
@ -1111,7 +1128,8 @@ bool IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
    // a function and before remapping metadata on instructions below
    // in RemapInstruction, as the saved mapping is used to handle
    // the temporary metadata hanging off instructions.
-    SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, true);
+    SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+                                             /* OnlyTempMD = */ true);

  // Link in the prefix data.
  if (Src.hasPrefixData())
@ -1514,7 +1532,8 @@ bool IRLinker::run() {
      // Ensure metadata materialized
      if (SrcM.getMaterializer()->materializeMetadata())
        return true;
-      SrcM.getMaterializer()->saveMetadataList(MetadataToIDs, false);
+      SrcM.getMaterializer()->saveMetadataList(MetadataToIDs,
+                                               /* OnlyTempMD = */ false);
    }

    linkNamedMDNodes();
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@ -514,13 +514,13 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
  MCOS->EmitULEB128IntValue(1);
  MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
  MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
-  EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list,
-             context.getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
-                                            : dwarf::DW_FORM_data4);
+  EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, context.getDwarfVersion() >= 4
+                                               ? dwarf::DW_FORM_sec_offset
+                                               : dwarf::DW_FORM_data4);
  if (context.getGenDwarfSectionSyms().size() > 1 &&
      context.getDwarfVersion() >= 3) {
-    EmitAbbrev(MCOS, dwarf::DW_AT_ranges,
-               context.getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+    EmitAbbrev(MCOS, dwarf::DW_AT_ranges, context.getDwarfVersion() >= 4
+                                              ? dwarf::DW_FORM_sec_offset
                                              : dwarf::DW_FORM_data4);
  } else {
    EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@ -1,4 +1,4 @@
-//===-- MObjectFileInfo.cpp - Object File Information ---------------------===//
+//===-- MCObjectFileInfo.cpp - Object File Information --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@ -63,31 +63,30 @@ FeatureBitset MCSubtargetInfo::ToggleFeature(const FeatureBitset &FB) {
 /// ToggleFeature - Toggle a feature and returns the re-computed feature
 /// bits. This version will also change all implied bits.
 FeatureBitset MCSubtargetInfo::ToggleFeature(StringRef FS) {
-  SubtargetFeatures Features;
-  FeatureBits = Features.ToggleFeature(FeatureBits, FS, ProcFeatures);
+  SubtargetFeatures::ToggleFeature(FeatureBits, FS, ProcFeatures);
  return FeatureBits;
 }

 FeatureBitset MCSubtargetInfo::ApplyFeatureFlag(StringRef FS) {
-  SubtargetFeatures Features;
-  FeatureBits = Features.ApplyFeatureFlag(FeatureBits, FS, ProcFeatures);
+  SubtargetFeatures::ApplyFeatureFlag(FeatureBits, FS, ProcFeatures);
  return FeatureBits;
 }

 const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
  assert(ProcSchedModels && "Processor machine model not available!");

-  size_t NumProcs = ProcDesc.size();
-  assert(std::is_sorted(ProcSchedModels, ProcSchedModels+NumProcs,
+  ArrayRef<SubtargetInfoKV> SchedModels(ProcSchedModels, ProcDesc.size());
+
+  assert(std::is_sorted(SchedModels.begin(), SchedModels.end(),
                    [](const SubtargetInfoKV &LHS, const SubtargetInfoKV &RHS) {
                      return strcmp(LHS.Key, RHS.Key) < 0;
                    }) &&
         "Processor machine model table is not sorted");

  // Find entry
-  const SubtargetInfoKV *Found =
-    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU);
-  if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
+  auto Found =
+    std::lower_bound(SchedModels.begin(), SchedModels.end(), CPU);
+  if (Found == SchedModels.end() || StringRef(Found->Key) != CPU) {
    if (CPU != "help") // Don't error if the user asked for help.
      errs() << "'" << CPU
             << "' is not a recognized processor for this target"
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@ -160,10 +160,9 @@ void ClearImpliedBits(FeatureBitset &Bits,
  }
 }

-/// ToggleFeature - Toggle a feature and returns the newly updated feature
-/// bits.
-FeatureBitset
-SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature,
+/// ToggleFeature - Toggle a feature and update the feature bits.
+void
+SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
                                 ArrayRef<SubtargetFeatureKV> FeatureTable) {

  // Find feature in table.
@ -186,12 +185,9 @@ SubtargetFeatures::ToggleFeature(FeatureBitset Bits, StringRef Feature,
           << "' is not a recognized feature for this target"
           << " (ignoring feature)\n";
  }
-
-  return Bits;
 }

-FeatureBitset
-SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
+void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
                                    ArrayRef<SubtargetFeatureKV> FeatureTable) {

  assert(hasFlag(Feature));
@ -203,7 +199,7 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
  if (FeatureEntry) {
    // Enable/disable feature in bits
    if (isEnabled(Feature)) {
-      Bits |=  FeatureEntry->Value;
+      Bits |= FeatureEntry->Value;

      // For each feature that this implies, set it.
      SetImpliedBits(Bits, FeatureEntry, FeatureTable);
@ -218,8 +214,6 @@ SubtargetFeatures::ApplyFeatureFlag(FeatureBitset Bits, StringRef Feature,
           << "' is not a recognized feature for this target"
           << " (ignoring feature)\n";
  }
-
-  return Bits;
 }


@ -234,14 +228,10 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
    return FeatureBitset();

 #ifndef NDEBUG
-  for (size_t i = 1, e = CPUTable.size(); i != e; ++i) {
-    assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
-           "CPU table is not sorted");
-  }
-  for (size_t i = 1, e = FeatureTable.size(); i != e; ++i) {
-    assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
-          "CPU features table is not sorted");
-  }
+  assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) &&
+         "CPU table is not sorted");
+  assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) &&
+         "CPU features table is not sorted");
 #endif
  // Resulting bits
  FeatureBitset Bits;
@ -277,7 +267,7 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
    if (Feature == "+help")
      Help(CPUTable, FeatureTable);

-    Bits = ApplyFeatureFlag(Bits, Feature, FeatureTable);
+    ApplyFeatureFlag(Bits, Feature, FeatureTable);
  }

  return Bits;
--- a/lib/ProfileData/CoverageMappingReader.cpp
+++ b/lib/ProfileData/CoverageMappingReader.cpp
@ -316,12 +316,17 @@ static std::error_code readCoverageMappingData(

  // Read the records in the coverage data section.
  for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) {
-    if (Buf + 4 * sizeof(uint32_t) > End)
+    if (Buf + sizeof(CovMapHeader) > End)
      return coveragemap_error::malformed;
-    uint32_t NRecords = endian::readNext<uint32_t, Endian, unaligned>(Buf);
-    uint32_t FilenamesSize = endian::readNext<uint32_t, Endian, unaligned>(Buf);
-    uint32_t CoverageSize = endian::readNext<uint32_t, Endian, unaligned>(Buf);
-    uint32_t Version = endian::readNext<uint32_t, Endian, unaligned>(Buf);
+    auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
+    uint32_t NRecords =
+        endian::byte_swap<uint32_t, Endian>(CovHeader->NRecords);
+    uint32_t FilenamesSize =
+        endian::byte_swap<uint32_t, Endian>(CovHeader->FilenamesSize);
+    uint32_t CoverageSize =
+        endian::byte_swap<uint32_t, Endian>(CovHeader->CoverageSize);
+    uint32_t Version = endian::byte_swap<uint32_t, Endian>(CovHeader->Version);
+    Buf = reinterpret_cast<const char *>(++CovHeader);

    switch (Version) {
    case CoverageMappingVersion1:
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@ -12,12 +12,15 @@
 //
 //===----------------------------------------------------------------------===//

+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/GlobalVariable.h"
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Compression.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/ManagedStatic.h"

 using namespace llvm;
@ -162,6 +165,98 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef FuncName) {
  return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), FuncName);
 }

+int collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+                              bool doCompression, std::string &Result) {
+  uint8_t Header[16], *P = Header;
+  std::string UncompressedNameStrings =
+      join(NameStrs.begin(), NameStrs.end(), StringRef(" "));
+
+  unsigned EncLen = encodeULEB128(UncompressedNameStrings.length(), P);
+  P += EncLen;
+
+  auto WriteStringToResult = [&](size_t CompressedLen,
+                                 const std::string &InputStr) {
+    EncLen = encodeULEB128(CompressedLen, P);
+    P += EncLen;
+    char *HeaderStr = reinterpret_cast<char *>(&Header[0]);
+    unsigned HeaderLen = P - &Header[0];
+    Result.append(HeaderStr, HeaderLen);
+    Result += InputStr;
+    return 0;
+  };
+
+  if (!doCompression)
+    return WriteStringToResult(0, UncompressedNameStrings);
+
+  SmallVector<char, 128> CompressedNameStrings;
+  zlib::Status Success =
+      zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
+                     zlib::BestSizeCompression);
+
+  if (Success != zlib::StatusOK)
+    return 1;
+
+  return WriteStringToResult(
+      CompressedNameStrings.size(),
+      std::string(CompressedNameStrings.data(), CompressedNameStrings.size()));
+}
+
+StringRef getPGOFuncNameInitializer(GlobalVariable *NameVar) {
+  auto *Arr = cast<ConstantDataArray>(NameVar->getInitializer());
+  StringRef NameStr =
+      Arr->isCString() ? Arr->getAsCString() : Arr->getAsString();
+  return NameStr;
+}
+
+int collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+                              std::string &Result) {
+  std::vector<std::string> NameStrs;
+  for (auto *NameVar : NameVars) {
+    NameStrs.push_back(getPGOFuncNameInitializer(NameVar));
+  }
+  return collectPGOFuncNameStrings(NameStrs, zlib::isAvailable(), Result);
+}
+
+int readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
+  const uint8_t *P = reinterpret_cast<const uint8_t *>(NameStrings.data());
+  const uint8_t *EndP = reinterpret_cast<const uint8_t *>(NameStrings.data() +
+                                                          NameStrings.size());
+  while (P < EndP) {
+    uint32_t N;
+    uint64_t UncompressedSize = decodeULEB128(P, &N);
+    P += N;
+    uint64_t CompressedSize = decodeULEB128(P, &N);
+    P += N;
+    bool isCompressed = (CompressedSize != 0);
+    SmallString<128> UncompressedNameStrings;
+    StringRef NameStrings;
+    if (isCompressed) {
+      StringRef CompressedNameStrings(reinterpret_cast<const char *>(P),
+                                      CompressedSize);
+      if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
+                           UncompressedSize) != zlib::StatusOK)
+        return 1;
+      P += CompressedSize;
+      NameStrings = StringRef(UncompressedNameStrings.data(),
+                              UncompressedNameStrings.size());
+    } else {
+      NameStrings =
+          StringRef(reinterpret_cast<const char *>(P), UncompressedSize);
+      P += UncompressedSize;
+    }
+    // Now parse the name strings.
+    SmallVector<StringRef, 0> Names;
+    NameStrings.split(Names, ' ');
+    for (StringRef &Name : Names)
+      Symtab.addFuncName(Name);
+
+    while (P < EndP && *P == 0)
+      P++;
+  }
+  Symtab.finalizeSymtab();
+  return 0;
+}
+
 instrprof_error
 InstrProfValueSiteRecord::mergeValueData(InstrProfValueSiteRecord &Input,
                                         uint64_t Weight) {
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@ -446,7 +446,7 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
  return EC;
 }

-bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
+bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) {
  static long ArgMax = sysconf(_SC_ARG_MAX);

  // System says no practical limit.
@ -456,7 +456,7 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
  // Conservatively account for space required by environment variables.
  long HalfArgMax = ArgMax / 2;

-  size_t ArgLength = 0;
+  size_t ArgLength = Program.size() + 1;
  for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
       I != E; ++I) {
    ArgLength += strlen(*I) + 1;
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@ -535,14 +535,15 @@ llvm::sys::writeFileWithEncoding(StringRef FileName, StringRef Contents,
  return EC;
 }

-bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
+bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<const char*> Args) {
  // The documented max length of the command line passed to CreateProcess.
  static const size_t MaxCommandStringLength = 32768;
-  size_t ArgLength = 0;
+  // Account for the trailing space for the program path and the
+  // trailing NULL of the last argument.
+  size_t ArgLength = ArgLenWithQuotes(Program.str().c_str()) + 2;
  for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
       I != E; ++I) {
-    // Account for the trailing space for every arg but the last one and the
-    // trailing NULL of the last argument.
+    // Account for the trailing space for every arg
    ArgLength += ArgLenWithQuotes(*I) + 1;
    if (ArgLength > MaxCommandStringLength) {
      return false;
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@ -30,6 +30,9 @@
 #define _WIN32_WINNT 0x0601
 #define _WIN32_IE    0x0800 // MinGW at it again. FIXME: verify if still needed.
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif

 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@ -44,6 +47,21 @@
 #include <string>
 #include <vector>

+#if !defined(__CYGWIN__) && !defined(__MINGW32__)
+#include <VersionHelpers.h>
+#else
+// Cygwin does not have the IsWindows8OrGreater() API.
+// Some version of mingw does not have the API either.
+inline bool IsWindows8OrGreater() {
+  OSVERSIONINFO osvi = {};
+  osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+  if (!::GetVersionEx(&osvi))
+    return false;
+  return (osvi.dwMajorVersion > 6 ||
+          (osvi.dwMajorVersion == 6 && osvi.dwMinorVersion >= 2));
+}
+#endif // __CYGWIN__
+
 inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
  if (!ErrMsg)
    return true;
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@ -57,6 +57,10 @@
 #endif
 #endif

+#ifdef LLVM_ON_WIN32
+#include "Windows/WindowsSupport.h"
+#endif
+
 using namespace llvm;

 raw_ostream::~raw_ostream() {
@ -567,8 +571,21 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
  assert(FD >= 0 && "File already closed.");
  pos += Size;

+#ifndef LLVM_ON_WIN32
+  bool ShouldWriteInChunks = false;
+#else
+  // Writing a large size of output to Windows console returns ENOMEM. It seems
+  // that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and
+  // the latter has a size limit (66000 bytes or less, depending on heap usage).
+  bool ShouldWriteInChunks = !!::_isatty(FD) && !IsWindows8OrGreater();
+#endif
+
  do {
-    ssize_t ret = ::write(FD, Ptr, Size);
+    size_t ChunkSize = Size;
+    if (ChunkSize > 32767 && ShouldWriteInChunks)
+        ChunkSize = 32767;
+
+    ssize_t ret = ::write(FD, Ptr, ChunkSize);

    if (ret < 0) {
      // If it's a recoverable error, swallow it and retry the write.
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@ -722,7 +722,7 @@ Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const {

 std::string UnOpInit::getAsString() const {
  std::string Result;
-  switch (Opc) {
+  switch (getOpcode()) {
  case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break;
  case HEAD: Result = "!head"; break;
  case TAIL: Result = "!tail"; break;
@ -850,7 +850,7 @@ Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const {

 std::string BinOpInit::getAsString() const {
  std::string Result;
-  switch (Opc) {
+  switch (getOpcode()) {
  case CONCAT: Result = "!con"; break;
  case ADD: Result = "!add"; break;
  case AND: Result = "!and"; break;
@ -1054,7 +1054,7 @@ Init *TernOpInit::resolveReferences(Record &R,
                                    const RecordVal *RV) const {
  Init *lhs = LHS->resolveReferences(R, RV);

-  if (Opc == IF && lhs != LHS) {
+  if (getOpcode() == IF && lhs != LHS) {
    IntInit *Value = dyn_cast<IntInit>(lhs);
    if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
      Value = dyn_cast<IntInit>(I);
@ -1082,7 +1082,7 @@ Init *TernOpInit::resolveReferences(Record &R,

 std::string TernOpInit::getAsString() const {
  std::string Result;
-  switch (Opc) {
+  switch (getOpcode()) {
  case SUBST: Result = "!subst"; break;
  case FOREACH: Result = "!foreach"; break;
  case IF: Result = "!if"; break;
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@ -77,7 +77,8 @@ bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {
 /// SetValue -
 /// Return true on error, false on success.
 bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
-                        const std::vector<unsigned> &BitList, Init *V) {
+                        ArrayRef<unsigned> BitList, Init *V,
+                        bool AllowSelfAssignment) {
  if (!V) return false;

  if (!CurRec) CurRec = &CurMultiClass->Rec;
@ -91,8 +92,8 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
  // in the resolution machinery.
  if (BitList.empty())
    if (VarInit *VI = dyn_cast<VarInit>(V))
-      if (VI->getNameInit() == ValName)
-        return false;
+      if (VI->getNameInit() == ValName && !AllowSelfAssignment)
+        return true;

  // If we are assigning to a subset of the bits in the value... then we must be
  // assigning to a field of BitsRecTy, which must have a BitsInit
@ -165,7 +166,7 @@ bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
    if (i < SubClass.TemplateArgs.size()) {
      // If a value is specified for this template arg, set it now.
      if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i],
-                   std::vector<unsigned>(), SubClass.TemplateArgs[i]))
+                   None, SubClass.TemplateArgs[i]))
        return true;

      // Resolve it next.
@ -243,8 +244,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
      // If a value is specified for this template arg, set it in the
      // superclass now.
      if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i],
-                   std::vector<unsigned>(),
-                   SubMultiClass.TemplateArgs[i]))
+                   None, SubMultiClass.TemplateArgs[i]))
        return true;

      // Resolve it next.
@ -258,8 +258,7 @@ bool TGParser::AddSubMultiClass(MultiClass *CurMC,
      for (const auto &Def :
             makeArrayRef(CurMC->DefPrototypes).slice(newDefStart)) {
        if (SetValue(Def.get(), SubMultiClass.RefRange.Start, SMCTArgs[i],
-                     std::vector<unsigned>(),
-                     SubMultiClass.TemplateArgs[i]))
+                     None, SubMultiClass.TemplateArgs[i]))
          return true;

        // Resolve it next.
@ -332,8 +331,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){

    IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));

-    if (SetValue(IterRec.get(), Loc, IterVar->getName(),
-                 std::vector<unsigned>(), IVal))
+    if (SetValue(IterRec.get(), Loc, IterVar->getName(), None, IVal))
      return Error(Loc, "when instantiating this def");

    // Resolve it next.
@ -1728,7 +1726,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
    SMLoc ValLoc = Lex.getLoc();
    Init *Val = ParseValue(CurRec, Type);
    if (!Val ||
-        SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val))
+        SetValue(CurRec, ValLoc, DeclName, None, Val))
      // Return the name, even if an error is thrown.  This is so that we can
      // continue to make some progress, even without the value having been
      // initialized.
@ -2358,8 +2356,8 @@ Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
  // Set the value for NAME. We don't resolve references to it 'til later,
  // though, so that uses in nested multiclass names don't get
  // confused.
-  if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME",
-               std::vector<unsigned>(), DefmPrefix)) {
+  if (SetValue(CurRec.get(), Ref.RefRange.Start, "NAME", None, DefmPrefix,
+               /*AllowSelfAssignment*/true)) {
    Error(DefmPrefixRange.Start, "Could not resolve " +
          CurRec->getNameInitAsString() + ":NAME to '" +
          DefmPrefix->getAsUnquotedString() + "'");
@ -2446,8 +2444,7 @@ bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC, Record *CurRec,
    // Check if a value is specified for this temp-arg.
    if (i < TemplateVals.size()) {
      // Set it now.
-      if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(),
-                   TemplateVals[i]))
+      if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], None, TemplateVals[i]))
        return true;

      // Resolve it next.
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@ -105,10 +105,13 @@ public:
 private:  // Semantic analysis methods.
  bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
  bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName,
-                const std::vector<unsigned> &BitList, Init *V);
+                ArrayRef<unsigned> BitList, Init *V,
+                bool AllowSelfAssignment = false);
  bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
-                const std::vector<unsigned> &BitList, Init *V) {
-    return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V);
+                ArrayRef<unsigned> BitList, Init *V,
+                bool AllowSelfAssignment = false) {
+    return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V,
+                    AllowSelfAssignment);
  }
  bool AddSubClass(Record *Rec, SubClassReference &SubClass);
  bool AddSubMultiClass(MultiClass *CurMC,
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@ -124,6 +124,14 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
                                   FeaturePerfMon,
                                   FeatureZCRegMove, FeatureZCZeroing]>;

+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+                                    "Samsung Exynos-M1 processors",
+                                    [FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeatureCrypto,
+                                    FeatureCRC,
+                                    FeaturePerfMon]>;
+
 def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
                                              FeatureNEON,
                                              FeatureCRC,
@ -136,6 +144,8 @@ def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
 // FIXME: Cortex-A72 is currently modelled as an Cortex-A57.
 def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>;
 def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
+// FIXME: Exynos-M1 is currently modelled without a specific SchedModel.
+def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>;

 //===----------------------------------------------------------------------===//
 // Assembly parser
--- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@ -158,7 +158,7 @@ INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
                    "AArch64 A57 FP Load-Balancing", false, false)

 namespace {
-/// A Chain is a sequence of instructions that are linked together by 
+/// A Chain is a sequence of instructions that are linked together by
 /// an accumulation operand. For example:
 ///
 ///   fmul d0<def>, ?
@ -285,7 +285,7 @@ public:
  std::string str() const {
    std::string S;
    raw_string_ostream OS(S);
-    
+
    OS << "{";
    StartInst->print(OS, /* SkipOpers= */true);
    OS << " -> ";
@ -427,7 +427,7 @@ Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
      return Ch;
    }
  }
-  
+
  // Bailout case - just return the first item.
  Chain *Ch = L.front();
  L.erase(L.begin());
@ -495,7 +495,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
  RS.enterBasicBlock(&MBB);
  RS.forward(MachineBasicBlock::iterator(G->getStart()));

-  // Can we find an appropriate register that is available throughout the life 
+  // Can we find an appropriate register that is available throughout the life
  // of the chain?
  unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
  BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -2426,7 +2426,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(

      continue;
    }
-    
+
    if (VA.isRegLoc()) {
      // Arguments stored in registers.
      EVT RegVT = VA.getLocVT();
@ -5074,7 +5074,7 @@ static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
-  // value of the first element.  E.g. 
+  // value of the first element.  E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
@ -9491,6 +9491,103 @@ static SDValue performBRCONDCombine(SDNode *N,
  return SDValue();
 }

+// Optimize some simple tbz/tbnz cases.  Returns the new operand and bit to test
+// as well as whether the test should be inverted.  This code is required to
+// catch these cases (as opposed to standard dag combines) because
+// AArch64ISD::TBZ is matched during legalization.
+static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
+                                 SelectionDAG &DAG) {
+
+  if (!Op->hasOneUse())
+    return Op;
+
+  // We don't handle undef/constant-fold cases below, as they should have
+  // already been taken care of (e.g. and of 0, test of undefined shifted bits,
+  // etc.)
+
+  // (tbz (trunc x), b) -> (tbz x, b)
+  // This case is just here to enable more of the below cases to be caught.
+  if (Op->getOpcode() == ISD::TRUNCATE &&
+      Bit < Op->getValueType(0).getSizeInBits()) {
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+  }
+
+  if (Op->getNumOperands() != 2)
+    return Op;
+
+  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!C)
+    return Op;
+
+  switch (Op->getOpcode()) {
+  default:
+    return Op;
+
+  // (tbz (and x, m), b) -> (tbz x, b)
+  case ISD::AND:
+    if ((C->getZExtValue() >> Bit) & 1)
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    return Op;
+
+  // (tbz (shl x, c), b) -> (tbz x, b-c)
+  case ISD::SHL:
+    if (C->getZExtValue() <= Bit &&
+        (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+      Bit = Bit - C->getZExtValue();
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    }
+    return Op;
+
+  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
+  case ISD::SRA:
+    Bit = Bit + C->getZExtValue();
+    if (Bit >= Op->getValueType(0).getSizeInBits())
+      Bit = Op->getValueType(0).getSizeInBits() - 1;
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+
+  // (tbz (srl x, c), b) -> (tbz x, b+c)
+  case ISD::SRL:
+    if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
+      Bit = Bit + C->getZExtValue();
+      return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+    }
+    return Op;
+
+  // (tbz (xor x, -1), b) -> (tbnz x, b)
+  case ISD::XOR:
+    if ((C->getZExtValue() >> Bit) & 1)
+      Invert = !Invert;
+    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
+  }
+}
+
+// Optimize test single bit zero/non-zero and branch.
+static SDValue performTBZCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 SelectionDAG &DAG) {
+  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  bool Invert = false;
+  SDValue TestSrc = N->getOperand(1);
+  SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
+
+  if (TestSrc == NewTestSrc)
+    return SDValue();
+
+  unsigned NewOpc = N->getOpcode();
+  if (Invert) {
+    if (NewOpc == AArch64ISD::TBZ)
+      NewOpc = AArch64ISD::TBNZ;
+    else {
+      assert(NewOpc == AArch64ISD::TBNZ);
+      NewOpc = AArch64ISD::TBZ;
+    }
+  }
+
+  SDLoc DL(N);
+  return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
+                     DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
+}
+
 // vselect (v1i1 setcc) ->
 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
@ -9642,6 +9739,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::TBNZ:
+  case AArch64ISD::TBZ:
+    return performTBZCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCONDCombine(N, DCI, DAG, 2, 3);
  case AArch64ISD::DUP:
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@ -613,21 +613,6 @@ static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }

-// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
-static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
-                                   MachineInstr *Op1) {
-  assert(MI->memoperands_empty() && "expected a new machineinstr");
-  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) +
-                      (Op1->memoperands_end() - Op1->memoperands_begin());
-
-  MachineFunction *MF = MI->getParent()->getParent();
-  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
-  MachineSDNode::mmo_iterator MemEnd =
-      std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
-  MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
-  MI->setMemRefs(MemBegin, MemEnd);
-}
-
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
@ -692,10 +677,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                       TII->get(NewOpc))
                   .addOperand(getLdStRegOp(RtNewDest))
                   .addOperand(BaseRegOp)
-                   .addImm(OffsetImm);
-
-    // Copy MachineMemOperands from the original loads.
-    concatenateMemOperands(NewMemMI, I, Paired);
+                   .addImm(OffsetImm)
+                   .setMemRefs(I->mergeMemRefsWith(*Paired));

    DEBUG(
        dbgs()
@ -786,9 +769,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                  TII->get(NewOpc))
              .addOperand(getLdStRegOp(I))
              .addOperand(BaseRegOp)
-              .addImm(OffsetImm);
-    // Copy MachineMemOperands from the original stores.
-    concatenateMemOperands(MIB, I, Paired);
+              .addImm(OffsetImm)
+              .setMemRefs(I->mergeMemRefsWith(*Paired));
  } else {
    // Handle Unscaled
    if (IsUnscaled)
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@ -33,7 +33,14 @@ class Triple;

 class AArch64Subtarget : public AArch64GenSubtargetInfo {
 protected:
-  enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone};
+  enum ARMProcFamilyEnum {
+    Others,
+    CortexA35,
+    CortexA53,
+    CortexA57,
+    Cyclone,
+    ExynosM1
+  };

  /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
  ARMProcFamilyEnum ARMProcFamily;
@ -143,6 +150,7 @@ public:
  bool isCyclone() const { return CPUString == "cyclone"; }
  bool isCortexA57() const { return CPUString == "cortex-a57"; }
  bool isCortexA53() const { return CPUString == "cortex-a53"; }
+  bool isExynosM1() const { return CPUString == "exynos-m1"; }

  bool useAA() const override { return isCortexA53(); }

--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@ -834,7 +834,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings
 };

 uint32_t
-AArch64SysReg::SysRegMapper::fromString(StringRef Name, 
+AArch64SysReg::SysRegMapper::fromString(StringRef Name,
    const FeatureBitset& FeatureBits, bool &Valid) const {
  std::string NameLower = Name.lower();

@ -878,7 +878,7 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name,
 }

 std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits, 
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits,
                                      const FeatureBitset& FeatureBits) const {
  // First search the registers shared by all
  for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) {
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@ -285,17 +285,17 @@ struct AArch64NamedImmMapper {
    // Zero value of FeatureBitSet means the mapping is always available
    FeatureBitset FeatureBitSet;

-    bool isNameEqual(std::string Other, 
+    bool isNameEqual(std::string Other,
                     const FeatureBitset& FeatureBits) const {
-      if (FeatureBitSet.any() && 
+      if (FeatureBitSet.any() &&
          (FeatureBitSet & FeatureBits).none())
        return false;
      return Name == Other;
    }

-    bool isValueEqual(uint32_t Other, 
+    bool isValueEqual(uint32_t Other,
                      const FeatureBitset& FeatureBits) const {
-      if (FeatureBitSet.any() && 
+      if (FeatureBitSet.any() &&
          (FeatureBitSet & FeatureBits).none())
        return false;
      return Value == Other;
@ -310,7 +310,7 @@ struct AArch64NamedImmMapper {
  StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits,
                     bool &Valid) const;
  // Maps string to value, depending on availability for FeatureBits given
-  uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, 
+  uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
                     bool &Valid) const;

  /// Many of the instructions allow an alternative assembly form consisting of
@ -1322,7 +1322,7 @@ namespace AArch64TLBI {
      return true;
    }
  }
-} 
+}

 namespace AArch64II {
  /// Target Operand Flag enum.
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@ -118,6 +118,11 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
        "true",
        "Support flat address space">;

+def FeatureXNACK : SubtargetFeature<"xnack",
+        "EnableXNACK",
+        "true",
+        "Enable XNACK support">;
+
 def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
        "EnableVGPRSpilling",
        "true",
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -417,13 +417,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
    }
  }

-  if (VCCUsed || FlatUsed)
+  if (VCCUsed || FlatUsed || STM.isXNACKEnabled()) {
    MaxSGPR += 2;

-  if (FlatUsed) {
-    MaxSGPR += 2;
-    // 2 additional for VI+.
-    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    if (FlatUsed)
+      MaxSGPR += 2;
+
+    if (STM.isXNACKEnabled())
      MaxSGPR += 2;
  }

@ -620,6 +620,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
  if (MFI->hasDispatchPtr())
    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

+  if (STM.isXNACKEnabled())
+    header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+
  header.kernarg_segment_byte_size = MFI->ABIArgOffset;
  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
  header.workitem_vgpr_count = KernelInfo.NumVGPR;
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@ -204,14 +204,6 @@ def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;

-def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
-    return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
-    return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
 def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@ -243,14 +235,6 @@ def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;

-def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
-    return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
-    return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
 def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
@ -299,16 +283,6 @@ def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
  return isGlobalStore(dyn_cast<StoreSDNode>(N));
 }]>;

-def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr),
-                                  (truncstorei8 node:$val, node:$ptr), [{
-  return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
-def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr),
-                                  (truncstorei16 node:$val, node:$ptr), [{
-  return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
 def local_store : PatFrag<(ops node:$val, node:$ptr),
                             (store node:$val, node:$ptr), [{
  return isLocalStore(dyn_cast<StoreSDNode>(N));
@ -385,15 +359,6 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {

 defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;

-def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-    return isFlatLoad(dyn_cast<LoadSDNode>(N));
-}]>;
-
-def flat_store : PatFrag<(ops node:$val, node:$ptr),
-                         (store node:$val, node:$ptr), [{
-  return isFlatStore(dyn_cast<StoreSDNode>(N));
-}]>;
-
 def mskor_flat : PatFrag<(ops node:$val, node:$ptr),
                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@ -73,6 +73,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
      CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false),
      EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true),
      EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false),
+      EnableXNACK(false),
      WavefrontSize(0), CFALUBug(false), LocalMemorySize(0),
      EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
      GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@ -76,6 +76,7 @@ private:
  bool EnableIfCvt;
  bool EnableLoadStoreOpt;
  bool EnableUnsafeDSOffsetFolding;
+  bool EnableXNACK;
  unsigned WavefrontSize;
  bool CFALUBug;
  int LocalMemorySize;
@ -290,6 +291,10 @@ public:
  }
  bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const;

+  bool isXNACKEnabled() const {
+    return EnableXNACK;
+  }
+
  unsigned getMaxWavesPerCU() const {
    if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
      return 10;
--- a/lib/Target/AMDGPU/CIInstructions.td
+++ b/lib/Target/AMDGPU/CIInstructions.td
@ -264,42 +264,6 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <

 } // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst

-//===----------------------------------------------------------------------===//
-// Flat Patterns
-//===----------------------------------------------------------------------===//
-
-let Predicates = [HasFlatAddressSpace] in {
-
-class FLATLoad_Pattern <FLAT Instr_ADDR64, ValueType vt,
-                             PatFrag flat_ld> :
-  Pat <(vt (flat_ld i64:$ptr)),
-       (Instr_ADDR64 $ptr, 0, 0, 0)
->;
-
-def : FLATLoad_Pattern <FLAT_LOAD_SBYTE, i32, sextloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_UBYTE, i32, az_extloadi8_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_SSHORT, i32, sextloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_USHORT, i32, az_extloadi16_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORD, i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, i64, az_extloadi32_flat>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX2, v2i32, flat_load>;
-def : FLATLoad_Pattern <FLAT_LOAD_DWORDX4, v4i32, flat_load>;
-
-class FLATStore_Pattern <FLAT Instr, ValueType vt, PatFrag st> :
-  Pat <(st vt:$value, i64:$ptr),
-        (Instr $value, $ptr, 0, 0, 0)
-  >;
-
-def : FLATStore_Pattern <FLAT_STORE_BYTE, i32, truncstorei8_flat>;
-def : FLATStore_Pattern <FLAT_STORE_SHORT, i32, truncstorei16_flat>;
-def : FLATStore_Pattern <FLAT_STORE_DWORD, i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, i64, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX2, v2i32, flat_store>;
-def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>;
-
-} // End HasFlatAddressSpace predicate
-
 let Predicates = [isCI] in {

 // Convert (x - floor(x)) to fract(x)
@ -320,20 +284,10 @@ def : Pat <


 //===----------------------------------------------------------------------===//
-// Patterns to generate flat for global
+// Flat Patterns
 //===----------------------------------------------------------------------===//

-def useFlatForGlobal : Predicate <
-  "Subtarget->useFlatForGlobal() || "
-  "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">;
-
-let Predicates = [useFlatForGlobal] in {
-
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
-  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
->;
+let Predicates = [isCIVI] in {

 // Patterns for global loads with no offset
 class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
@ -341,24 +295,24 @@ class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
  (inst $addr, 0, 0, 0)
 >;

-def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>;

 class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
  (node vt:$data, i64:$addr),
  (inst $data, $addr, 0, 0, 0)
 >;

-def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>;
-def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>;
+def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>;
+def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>;

 class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
  (vt (node i64:$addr, vt:$data)),
@ -376,4 +330,4 @@ def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;

-} // End Predicates = [useFlatForGlobal]
+} // End Predicates = [isCIVI]
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@ -105,51 +105,53 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

-  // We reserved the last registers for this. Shift it down to the end of those
-  // which were actually used.
-  //
-  // FIXME: It might be safer to use a pseudoregister before replacement.
+  if (!ST.hasSGPRInitBug()) {
+    // We reserved the last registers for this. Shift it down to the end of those
+    // which were actually used.
+    //
+    // FIXME: It might be safer to use a pseudoregister before replacement.

-  // FIXME: We should be able to eliminate unused input registers. We only
-  // cannot do this for the resources required for scratch access. For now we
-  // skip over user SGPRs and may leave unused holes.
+    // FIXME: We should be able to eliminate unused input registers. We only
+    // cannot do this for the resources required for scratch access. For now we
+    // skip over user SGPRs and may leave unused holes.

-  // We find the resource first because it has an alignment requirement.
-  if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
+    // We find the resource first because it has an alignment requirement.
+    if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();

-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
-      // Pick the first unallocated one. Make sure we don't clobber the other
-      // reserved input we needed.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg));
-        MRI.replaceRegWith(ScratchRsrcReg, Reg);
-        ScratchRsrcReg = Reg;
-        MFI->setScratchRSrcReg(ScratchRsrcReg);
-        break;
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+      // Skip the last 2 elements because the last one is reserved for VCC, and
+      // this is the 2nd to last element already.
+      for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+        // Pick the first unallocated one. Make sure we don't clobber the other
+        // reserved input we needed.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg));
+          MRI.replaceRegWith(ScratchRsrcReg, Reg);
+          ScratchRsrcReg = Reg;
+          MFI->setScratchRSrcReg(ScratchRsrcReg);
+          break;
+        }
      }
    }
-  }

-  if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-    for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
-      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-      // scratch descriptor, since we haven’t added its uses yet.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg) &&
-               !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+    if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      // Skip the last 2 elements because the last one is reserved for VCC, and
+      // this is the 2nd to last element already.
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+        // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+        // scratch descriptor, since we haven’t added its uses yet.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg) &&
+                !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));

-        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-        ScratchWaveOffsetReg = Reg;
-        MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
-        break;
+          MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+          ScratchWaveOffsetReg = Reg;
+          MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+          break;
+        }
      }
    }
  }
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@ -134,6 +134,34 @@ def SIconstdata_ptr : SDNode<
                                                     SDTCisVT<0, i64>]>
 >;

+//===----------------------------------------------------------------------===//
+// PatFrags for FLAT instructions
+//===----------------------------------------------------------------------===//
+
+class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
+                                               (ld node:$ptr), [{
+  return isFlatLoad(dyn_cast<LoadSDNode>(N)) ||
+         isGlobalLoad(dyn_cast<LoadSDNode>(N)) ||
+         isConstantLoad(cast<LoadSDNode>(N), -1);
+}]>;
+
+def flat_load : flat_ld <load>;
+def flat_az_extloadi8 : flat_ld <az_extloadi8>;
+def flat_sextloadi8 : flat_ld <sextloadi8>;
+def flat_az_extloadi16 : flat_ld <az_extloadi16>;
+def flat_sextloadi16 : flat_ld <sextloadi16>;
+
+class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
+                                               (st node:$val, node:$ptr), [{
+  return isFlatStore(dyn_cast<StoreSDNode>(N)) ||
+         isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def flat_store: flat_st <store>;
+def flat_truncstorei8 : flat_st <truncstorei8>;
+def flat_truncstorei16 : flat_st <truncstorei16>;
+
+
 def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
 	return isGlobalLoad(cast<LoadSDNode>(N)) ||
         isConstantLoad(cast<LoadSDNode>(N), -1);
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -59,8 +59,6 @@ defm EXP : EXP_m;
 // SMRD Instructions
 //===----------------------------------------------------------------------===//

-let mayLoad = 1 in {
-
 // We are using the SGPR_32 and not the SReg_32 register class for 32-bit
 // SMRD instructions, because the SGPR_32 register class does not include M0
 // and writing to M0 from an SMRD instruction will hang the GPU.
@ -90,8 +88,6 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
  smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
 >;

-} // mayLoad = 1
-
 //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;

 defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv",
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@ -156,6 +156,17 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(

  if (!LaneVGPRs.count(LaneVGPRIdx)) {
    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass);
+
+    if (LaneVGPR == AMDGPU::NoRegister) {
+      LLVMContext &Ctx = MF->getFunction()->getContext();
+      Ctx.emitError("Ran out of VGPRs for spilling SGPR");
+
+      // When compiling from inside Mesa, the compilation continues.
+      // Select an arbitrary register to avoid triggering assertions
+      // during subsequent passes.
+      LaneVGPR = AMDGPU::VGPR0;
+    }
+
    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;

    // Add this register as live-in to all blocks to avoid machine verifer
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@ -37,13 +37,17 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  if (ST.hasSGPRInitBug()) {
    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
+    if (ST.isXNACKEnabled())
+      BaseIdx -= 4;
+
    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
  }

  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
-    // next sgpr128 down.
+    // 98/99 need to be reserved for flat_scr or 96/97 for flat_scr and
+    // 98/99 for xnack_mask, and 100/101 for vcc. This is the next sgpr128 down
+    // either way.
    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
  }

@ -54,13 +58,25 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+    unsigned Idx;
+
+    if (!ST.isXNACKEnabled())
+      Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
+    else
+      Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
+
    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
  }

  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr and vcc.
-    return AMDGPU::SGPR97;
+    if (!ST.isXNACKEnabled()) {
+      // Next register before reservations for flat_scr and vcc.
+      return AMDGPU::SGPR97;
+    } else {
+      // Next register before reservations for flat_scr, xnack_mask, vcc,
+      // and scratch resource.
+      return AMDGPU::SGPR91;
+    }
  }

  return AMDGPU::SGPR95;
@ -86,6 +102,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
    // for VCC/FLAT_SCR.
    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
+
+    if (ST.isXNACKEnabled())
+      reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
  }

  // Tonga and Iceland can only allocate a fixed number of SGPRs due
@ -93,9 +112,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  if (ST.hasSGPRInitBug()) {
    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
    // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs).
-    // Assume XNACK_MASK is unused.
    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4;

+    if (ST.isXNACKEnabled())
+      Limit -= 2;
+
    for (unsigned i = Limit; i < NumSGPRs; ++i) {
      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
@ -282,11 +303,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

-        if (Spill.VGPR == AMDGPU::NoRegister) {
-           LLVMContext &Ctx = MF->getFunction()->getContext();
-           Ctx.emitError("Ran out of VGPRs for spilling SGPR");
-        }
-
        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                Spill.VGPR)
@ -315,11 +331,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

-        if (Spill.VGPR == AMDGPU::NoRegister) {
-           LLVMContext &Ctx = MF->getFunction()->getContext();
-           Ctx.emitError("Ran out of VGPRs for spilling SGPR");
-        }
-
        BuildMI(*MBB, MI, DL,
                TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                SubReg)
--- a/lib/Target/AMDGPU/VIInstructions.td
+++ b/lib/Target/AMDGPU/VIInstructions.td
@ -101,3 +101,12 @@ def S_DCACHE_WB_VOL : SMEM_Inval <0x23,

 } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI

+let Predicates = [isVI] in {
+
+// 1. Offset as 20bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM20bit:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset))
+>;
+
+} // End Predicates = [isVI]
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@ -252,6 +252,8 @@ def ProcKrait   : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
 def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
                                   "Swift ARM processors", []>;

+def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
+                                    "Samsung Exynos-M1 processors", []>;

 def ProcR4      : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4",
                                    "Cortex-R4 ARM processors", []>;
@ -649,6 +651,12 @@ def : ProcessorModel<"cyclone",     SwiftModel,         [ARMv8a, ProcSwift,
                                                         FeatureCrypto,
                                                         FeatureZCZeroing]>;

+def : ProcNoItin<"exynos-m1",                           [ARMv8a, ProcExynosM1,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureT2XtPk,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;

 //===----------------------------------------------------------------------===//
 // Register File Description
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@ -340,12 +340,12 @@ namespace {
 /// verify - check BBOffsets, BBSizes, alignment of islands
 void ARMConstantIslands::verify() {
 #ifndef NDEBUG
-  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
-       MBBI != E; ++MBBI) {
-    MachineBasicBlock *MBB = &*MBBI;
-    unsigned MBBId = MBB->getNumber();
-    assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
-  }
+  assert(std::is_sorted(MF->begin(), MF->end(),
+                        [this](const MachineBasicBlock &LHS,
+                               const MachineBasicBlock &RHS) {
+                          return BBInfo[LHS.getNumber()].postOffset() <
+                                 BBInfo[RHS.getNumber()].postOffset();
+                        }));
  DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
    CPUser &U = CPUsers[i];
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@ -1986,23 +1986,6 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
  return AddedRegPressure.size() <= MemRegs.size() * 2;
 }

-
-/// Copy \p Op0 and \p Op1 operands into a new array assigned to MI.
-static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
-                                   MachineInstr *Op1) {
-  assert(MI->memoperands_empty() && "expected a new machineinstr");
-  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
-    + (Op1->memoperands_end() - Op1->memoperands_begin());
-
-  MachineFunction *MF = MI->getParent()->getParent();
-  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
-  MachineSDNode::mmo_iterator MemEnd =
-    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
-  MemEnd =
-    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
-  MI->setMemRefs(MemBegin, MemEnd);
-}
-
 bool
 ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
                                          DebugLoc &dl, unsigned &NewOpc,
@ -2196,7 +2179,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
-            concatenateMemOperands(MIB, Op0, Op1);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumLDRDFormed;
          } else {
@ -2210,7 +2193,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
            if (!isT2)
              MIB.addReg(0);
            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
-            concatenateMemOperands(MIB, Op0, Op1);
+            MIB.setMemRefs(Op0->mergeMemRefsWith(*Op1));
            DEBUG(dbgs() << "Formed " << *MIB << "\n");
            ++NumSTRDFormed;
          }
--- a/Show more
+++ b/Show more