Skip to content

Contact Identification

Responsible for identifying all non-covalent interactions in a trajectory.

This bypasses the need for the install of PyContact and reproduces the scoring function used by PyContact.

Output will be a csv file with each column an interaction pair. Each column has the following format: [residue1][residue2] [interaction type]

Where "residue1" and "residue2" are the names and residue numbers of the pair. and "interaction type" is one of: "Hbond" - Hydrogen bond "Saltbr" - Salt Bridge "Hydrophobic" - VdW interaction between two hydrophobic residues. "VdW" - Unspecified VdW interaction.

calculate_contacts(parm_file, traj_file, out_file, first_res=None, last_res=None, report_timings=True)

Identify all non-covalent interactions present in the simulation and save the output. Output has each non-covalent interaction as a column each column has the following information: [residue1][residue2] [interaction type]

Parameters:

Name Type Description Default
parm_file str

The file path to your topology file. All MDAnalysis allowed topologies can be used. Please do not use a PDB file for this, use something with charge information. This is important for the hydrogen bonding part of the calculation to work.

required
traj_file str

The file path to your trajectory file. All MDAnalysis allowed trajectory file types can be used.

required
out_file str

Path to write the csv output file too.

required
first_res Optional[int]

First residue to analyse, useful if you want to break the analysis into blocks. If not provided, the first residue in the trajectory will be used.

None
last_res Optional[int]

Last residue to analyse, useful if you want to break the analysis into blocks. If not provided, the last residue in the trajectory will be used.

None
report_timings bool

Choose whether to print to the console how long the job took to run. Optional, default is True.

True

Returns:

Type Description
None

Output written to file. Optional timings printed to the console.

Source code in key_interactions_finder/contact_identification.py
def calculate_contacts(
    parm_file: str,
    traj_file: str,
    out_file: str,
    first_res: Optional[int] = None,
    last_res: Optional[int] = None,
    report_timings: bool = True,
) -> None:
    """

    Identify all non-covalent interactions present in the simulation and save the output.
    Output has each non-covalent interaction as a column
    each column has the following information: [residue1] [residue2] [interaction type]

    Parameters
    ----------

    parm_file: str
        The file path to your topology file. All MDAnalysis allowed topologies can be used.
        Please do not use a PDB file for this, use something with charge information.
        This is important for the hydrogen bonding part of the calculation to work.

    traj_file: str
        The file path to your trajectory file.
        All MDAnalysis allowed trajectory file types can be used.

    out_file: str
        Path to write the csv output file too.

    first_res: Optional[int]
        First residue to analyse, useful if you want to break the analysis into blocks.
        If not provided, the first residue in the trajectory will be used.

    last_res: Optional[int]
        Last residue to analyse, useful if you want to break the analysis into blocks.
        If not provided, the last residue in the trajectory will be used.

    report_timings: bool = True
        Choose whether to print to the console how long the job took to run.
        Optional, default is True.

    Returns
    -------

    None
        Output written to file. Optional timings printed to the console.
    """
    if report_timings:
        start_time = time.monotonic()

    universe = Universe(parm_file, traj_file)

    if first_res is None:
        first_res = 1

    biggest_protein_res = max(universe.select_atoms("name CA").resids)
    if last_res is None:
        last_res = biggest_protein_res

    if last_res > biggest_protein_res:
        warning_message = f"""
            You stated your last residue was residue number: {last_res}.
            I found the last protein residue to be: {biggest_protein_res}. \n
            This program is primarily designed to analyse protein-protein interactions, but can be used
            on ligands, or even water molecules \n
            If the difference between the {last_res} and {biggest_protein_res} is very large, then check you have not
            included all water molecules in your calculation. Doing so will make this calculation a lot slower. \n

            If the difference is small, you can safely ignore this warning message as it is probably the case you just
            have a ligand or two in your input file.
        """
        warnings.warn(warning_message, stacklevel=2)

        biggest_res = last_res
    else:
        biggest_res = biggest_protein_res

    trajectory_length = len(universe.trajectory)
    trajectory_of_zeros = np.zeros(trajectory_length)

    all_heavy_atoms_sele = "not name H* and resid 1" + "-" + str(biggest_res)
    all_heavy_atoms = universe.select_atoms(all_heavy_atoms_sele)
    residue_names = [names.capitalize() for names in list(universe.residues.resnames)]

    # determine which residue each heavy atom belongs to.
    residue_ranges = {}
    for res_numb in range(1, biggest_res + 1):
        residue_range = np.where(all_heavy_atoms.atoms.resids == res_numb)
        residue_ranges[res_numb] = residue_range

    print("setup complete, analysing contacts now...")
    hbond_pairs = _determine_hbond_pairs(universe=universe)

    # Now go through each frame.
    all_contact_scores = {}
    for idx, _ in enumerate(universe.trajectory):  # each step is new frame.
        # calculate all heavy atom distances for this trajectory
        heavy_atom_dists = distances.distance_array(
            all_heavy_atoms.positions,
            all_heavy_atoms.positions,
        )

        for res1 in range(first_res, last_res + 1):
            res_dists = heavy_atom_dists[residue_ranges[res1]]

            # +3 here as neighbouring residues not interesting.
            for res2 in range(res1 + 3, biggest_res + 1):
                res_res_dists = res_dists[:, residue_ranges[res2]]

                # score would be 0 if true.
                if res_res_dists.min() > MAX_HEAVY_DIST:
                    continue

                contact_score = _score_residue_contact(res_res_dists)
                if (res1, res2) not in all_contact_scores:
                    # create empty array of size trajectory for it...
                    all_contact_scores[(res1, res2)] = trajectory_of_zeros.copy()
                # now update score for this frame.
                all_contact_scores[(res1, res2)][idx] = contact_score

    contact_labels_scores = {}
    for res_pair, contact_scores in all_contact_scores.items():
        # save contact only if non-negligible interaction present
        avg_contact_score = sum(contact_scores) / trajectory_length
        if avg_contact_score < 0.1:
            continue

        res1, res2 = res_pair
        # -1 as 0 indexed...
        res1_name = residue_names[res1 - 1]
        res2_name = residue_names[res2 - 1]

        interaction_type = _determine_interaction_type(
            res1_id=res1,
            res2_id=res2,
            hbond_pairs=hbond_pairs,
            universe=universe,
        )

        contact_label = str(res1) + res1_name + " " + str(res2) + res2_name + " " + interaction_type
        contact_labels_scores.update({contact_label: contact_scores})

    # reorders column names, to be like the old format.
    sorted_dict = dict(
        sorted(
            contact_labels_scores.items(),
            key=lambda item: (int("".join(filter(str.isdigit, item[0].split(":")[0])))),
        )
    )

    df_contact_scores = pd.DataFrame(sorted_dict)
    df_contact_scores.to_csv(out_file, index=False)

    if report_timings:
        end_time = time.monotonic()
        delta_time = timedelta(seconds=end_time - start_time)
        print(f"Time taken: {delta_time}")